A Python crawler for website content, with local image storage: never worry about filling your site again!!

Many webmasters finish building a site and then agonize over where its content will come from, unsure which tools could help. This post provides a small Python crawler that fetches content for your site; the full program is below.
You can point it at whatever site you need: a few simple XPath expressions extract the article content, and the images inside each article are downloaded and stored locally.
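The script only needs three third-party packages, requests, lxml and pymysql, which you can install with pip install requests lxml pymysql.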

import requests
from lxml import etree
import pymysql
import re
import time
import os

# Fetch a page
def getdata(url):
    res = requests.get(url)
    return res
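If the target site rejects the default requests client, it usually helps to send a browser-like User-Agent header and a timeout. A minimal variant (the header value here is just an example):

def getdata(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # example browser-like header, adjust as needed
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    return res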

# Get the pagination links

def getpage(res):
    html = etree.HTML(res.text)
    html_data = html.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/a/@href')  # pagination links -- change this XPath to match your site
    pagelist = []
    pagelist.append(res.url)  # the first listing page itself
    print('Fetching pagination links')
    for i in html_data:
        pagelist.append('http://www.yousite.com' + i)
        print('Pagination URL:', 'http://www.yousite.com' + i)
    return pagelist

# Get the article URLs from every listing page
def getcontenturl(urllist):
    content_url = []
    print('Fetching article links')
    for url in urllist:
        html_text = requests.get(url).text
        page_url = etree.HTML(html_text).xpath('/html/body/div[3]/div[2]/div[2]/ul/li[*]/a/@href')  # article links -- change this XPath to match your site
        for i in page_url:
            content_url.append('http://www.yousite.com' + i)
            print('Article URL:', 'http://www.yousite.com' + i)
    return content_url


# Accept either bytes or str and always return str
def to_str(bytes_or_str):
    if isinstance(bytes_or_str, bytes):
        value = bytes_or_str.decode('utf-8')
    else:
        value = bytes_or_str
    return value
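For example, to_str(b'<p>hi</p>') and to_str('<p>hi</p>') both return the str '<p>hi</p>'. The helper matters because etree.tostring() below returns bytes, while the image paths are later rewritten with str.replace().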

# Parse the article pages

def analysis(content_url):
    print('Parsing article pages')
    data = []
    for url in content_url:
        text = requests.get(url)
        text.encoding = 'utf-8'
        title = etree.HTML(text.text).xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span/text()')[0]  # article title -- change this XPath to match your site
        content = etree.HTML(text.text).xpath('/html/body/div[3]/div[2]/div[2]/div[2]')[0]  # article body -- change this XPath to match your site
        content = etree.tostring(content, encoding='utf-8', method='html')
        content = to_str(content)

        # the second-to-last URL segment is expected to be a date such as 2021-01-01
        createtime = url.split('/')[-2]
        timeArray = time.strptime(createtime, '%Y-%m-%d')
        timeStamp = int(time.mktime(timeArray))
        print('Publish timestamp:', timeStamp)

        # plain text of the body, used for the abstract
        abstract_str = etree.HTML(text.text).xpath('/html/body/div[3]/div[2]/div[2]/div[2]//text()')
        abstract = ''.join(abstract_str)
        print(abstract)
        image = ''
        pic_url = re.findall('img.*?src="(.*?)"', content, re.S)
        if pic_url:
            key = 0  # index of the current image; the first one becomes the cover image
            for img in pic_url:
                print(img)
                # download the image, save it locally and rewrite its src in the article body
                try:
                    res = requests.get('http://www.yousite.com' + img)
                    img_data = res.content
                    file = img.split('/')[-1]
                    path = 'images/'
                    os.makedirs(path, exist_ok=True)  # make sure the local folder exists
                    content = content.replace(img, '/uploads/news/' + file)

                    if key == 0:
                        image = '/uploads/news/' + file  # the first image doubles as the cover image

                    with open(path + file, 'wb') as f:
                        f.write(img_data)
                except Exception:
                    pass  # skip images that fail to download
                key = key + 1



        # newscategory_id is hard-coded to '5' here; change it to the category the articles should be filed under
        content_data = {'name': title, 'newscategory_id': '5', 'content': content, 'abstract': abstract[0:100], 'image': image, 'createtime': timeStamp}
        data.append(content_data)
    return data
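Each article ends up as a dict whose keys match the columns written by seavdata(): name (the title), newscategory_id, content (the HTML body with rewritten image paths), abstract (the first 100 characters of the plain text), image (the cover image path) and createtime (a Unix timestamp).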

# MySQL database connection

def seavdata(content_data):
    connection = pymysql.connect(
        host='localhost',    # host
        user='root',         # user name
        password='',         # password
        db='yljiankang',     # database name
        charset='utf8mb4',   # set the charset here; assigning pymysql.charset has no effect
    )

    # insert the rows
    cursor = connection.cursor()
    data = []
    sql = 'insert into fa_news(name,newscategory_id,content,abstract,image,createtime) values(%s,%s,%s,%s,%s,%s);'
    for item in content_data:
        tem = (item['name'], item['newscategory_id'], item['content'], item['abstract'], item['image'], item['createtime'])
        data.append(tem)
    cursor.executemany(sql, data)
    connection.commit()
    connection.close()
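If the fa_news table does not exist yet, the sketch below creates a compatible one. The column names and order follow the INSERT statement above; the types and sizes are assumptions and may differ from the real table used by your site, so adjust them before relying on this.

# Minimal schema sketch: column names match the INSERT above, types are assumptions.
CREATE_SQL = '''
create table if not exists fa_news (
    id int unsigned auto_increment primary key,
    name varchar(255) not null,
    newscategory_id int not null,
    content text,
    abstract varchar(255),
    image varchar(255),
    createtime int
) default charset=utf8mb4;
'''

def create_table():
    connection = pymysql.connect(host='localhost', user='root', password='', db='yljiankang', charset='utf8mb4')
    with connection.cursor() as cursor:
        cursor.execute(CREATE_SQL)
    connection.commit()
    connection.close()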

# Main entry point

if __name__ == '__main__':
    url = 'http://www.yousite.com/xxx/'  # change this to the listing-page URL of the site you want to crawl
    res = getdata(url)                   # fetch the first listing page
    page = getpage(res)                  # collect all the pagination URLs
    content_url = getcontenturl(page)    # collect the article URLs from every listing page
    data = analysis(content_url)         # parse each article page
    seavdata(data)                       # write everything into MySQL
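Note that the images are saved into a local images/ folder while the article HTML is rewritten to point at /uploads/news/, so after the crawl you still need to copy the downloaded files into the uploads/news directory of your web server (or change both paths to whatever matches your setup).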
