Many webmasters finish building a site only to find themselves scrambling for content, with no idea what tools could help fill it out. This post walks through a Python crawler that fetches content for your site; the full program is below.
You can point it at whichever site you need: simple XPath expressions extract the page content, and the images inside each article are downloaded and stored locally.
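Before wiring XPath expressions into the full crawler, it is worth testing them interactively. Here is a minimal sketch, reusing the placeholder URL and XPath from the program below; in Chrome or Firefox devtools you can right-click a node and choose "Copy XPath" to get a starting expression:

import requests
from lxml import etree

res = requests.get('http://www.yousite.com/xxx/')
res.encoding = 'utf-8'
html = etree.HTML(res.text)
# Print whatever the expression matches, to verify it before using it in the crawler
print(html.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span/text()'))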
import requests
from lxml import etree
import pymysql
import re
import time
import os
# Fetch the listing page
def getdata(url):
    res = requests.get(url)
    return res
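# Note: some sites reject the default requests User-Agent and return an empty
# or error page. If that happens, a browser-like header usually helps (the UA
# string below is only an example):
#     res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})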
# Collect all pagination links from the listing page
def getpage(res, url):  # pass the start url in explicitly instead of relying on a global
    html = etree.HTML(res.text)
    html_data = html.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/a/@href')  # change this XPath to match your own site
    pagelist = []
    pagelist.append(url)
    print('Fetching pagination links')
    for i in html_data:
        pagelist.append('http://www.yousite.com' + i)
        print('Pagination link:', 'http://www.yousite.com' + i)
    return pagelist
# Collect the article links from every listing page
def getcontenturl(urllist):
    content_url = []
    print('Fetching article links')
    for url in urllist:
        html_text = requests.get(url).text
        page_url = etree.HTML(html_text).xpath('/html/body/div[3]/div[2]/div[2]/ul/li[*]/a/@href')  # change this XPath to match your own site
        for i in page_url:
            content_url.append('http://www.yousite.com' + i)
            print('Article link:', 'http://www.yousite.com' + i)
    return content_url
# Accept either str or bytes and always return str
def to_str(bytes_or_str):
    if isinstance(bytes_or_str, bytes):
        value = bytes_or_str.decode('utf-8')
    else:
        value = bytes_or_str
    return value
# Parse each article page: title, body, publish time, abstract and images
def analysis(content_url):
    print('Parsing article pages')
    data = []
    for url in content_url:
        text = requests.get(url)
        text.encoding = 'utf-8'
        html = etree.HTML(text.text)  # parse once and reuse it for every XPath below
        title = html.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span/text()')[0]  # article title; change to match your own site
        # print(title)
        content = html.xpath('/html/body/div[3]/div[2]/div[2]/div[2]')[0]  # article body; change to match your own site
        content = etree.tostring(content, encoding='utf-8', method='html')
        content = to_str(content)
        # print('Body:', content)
        # Assumes the article URL embeds the publish date, e.g. .../2021-06-01/123.html
        createtime = url.split('/')[-2]
        timeArray = time.strptime(createtime, '%Y-%m-%d')
        timeStamp = int(time.mktime(timeArray))
        print('Timestamp:', timeStamp)
        # Build the abstract from the plain text of the body
        abstract_str = html.xpath('/html/body/div[3]/div[2]/div[2]/div[2]//text()')
        abstract = ''.join(abstract_str)
        print(abstract)
        image = ''
        pic_url = re.findall('img.*?src="(.*?)"', content, re.S)
        key = 0  # initialise outside the loop so only the first image becomes the cover
        for img in pic_url:
            print(img)
            # Download the image, save it locally and rewrite its src in the body
            try:
                res = requests.get('http://www.yousite.com' + img)
                img_data = res.content
                file = img.split('/')[-1]
                path = 'images/'
                os.makedirs(path, exist_ok=True)  # create the folder if it is missing
                content = content.replace(img, '/uploads/news/' + file)
                if key == 0:
                    image = '/uploads/news/' + file
                with open(path + file, 'wb') as f:
                    f.write(img_data)
            except Exception:
                pass  # skip images that fail to download
            key = key + 1
        content_data = {'name': title, 'newscategory_id': '5', 'content': content,
                        'abstract': abstract[0:100], 'image': image, 'createtime': timeStamp}
        data.append(content_data)
    return data
# Write the parsed articles into MySQL
def savedata(content_data):
    connection = pymysql.connect(
        host='localhost',    # host address
        user='root',         # user name
        password='',         # password
        db='yljiankang',     # database name
        charset='utf8mb4',   # set the charset here; assigning pymysql.charset does nothing
    )
    cursor = connection.cursor()
    data = []
    sql = 'insert into fa_news(name,newscategory_id,content,abstract,image,createtime) values(%s,%s,%s,%s,%s,%s);'
    for item in content_data:
        tem = (item['name'], item['newscategory_id'], item['content'],
               item['abstract'], item['image'], item['createtime'])
        data.append(tem)
    cursor.executemany(sql, data)
    connection.commit()
    connection.close()
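# savedata assumes the fa_news table already exists. Below is a minimal sketch
# of a matching schema; the column names come from the insert statement above,
# but the types are assumptions you should adjust to your own data:
def createtable(connection):
    ddl = '''create table if not exists fa_news(
        id int auto_increment primary key,
        name varchar(255),
        newscategory_id int,
        content mediumtext,
        abstract varchar(255),
        image varchar(255),
        createtime int
    ) default charset=utf8mb4'''
    with connection.cursor() as cursor:
        cursor.execute(ddl)
    connection.commit()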
if __name__ == '__main__':
    url = 'http://www.yousite.com/xxx/'  # change to the listing page of the site you want to crawl
    res = getdata(url)                   # fetch the listing page
    page = getpage(res, url)             # collect all pagination links
    content_url = getcontenturl(page)    # collect the article links from every listing page
    data = analysis(content_url)         # parse each article page
    savedata(data)                       # write everything to MySQL
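One last practical note: the script above fires requests as fast as the loops run, which can get a crawler throttled or banned. A small wrapper like the sketch below (polite_get is a hypothetical helper; the one-second delay, three retries and ten-second timeout are assumptions to tune against the target site) can stand in for the bare requests.get calls:

import time
import requests

def polite_get(url, delay=1.0, retries=3):
    # Wait before each request and retry on transient network errors
    for attempt in range(retries):
        time.sleep(delay)
        try:
            res = requests.get(url, timeout=10)
            res.raise_for_status()
            return res
        except requests.RequestException:
            if attempt == retries - 1:
                raise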