python爬虫爬取古诗词内容,并存入mysql

python爬虫爬取古诗词内容,并存入mysql_第1张图片
爬取结果展示:
python爬虫爬取古诗词内容,并存入mysql_第2张图片
代码如下:

from urllib import request
import re,os
import pymysql
import time

base_url = "https://so.gushiwen.org"
shiwen_url='https://www.gushiwen.org/shiwen/'

def get_model_url(shiwen_url):
    """Scrape the category ("model") links from the poem index page.

    Returns whatever the compiled pattern's ``findall`` yields over the
    page HTML.

    NOTE(review): the HTML-tag portion of this regex appears to have been
    stripped when the article was scraped ('(.*?)' alone matches nothing
    useful) — restore the original ``<a href=...>`` pattern from the live
    page before running.
    """
    res = '(.*?)'
    page = get_html(shiwen_url)
    return re.compile(res).findall(page)

def conn_mysql():
    """Open and return a pymysql connection to the ``test`` database.

    NOTE(review): credentials are hard-coded in source; they should be
    moved to environment variables or a config file.
    """
    # pymysql 1.0+ removed positional arguments to connect(); the old
    # positional call (host, user, password, db) raises TypeError there,
    # so pass keywords explicitly.
    db = pymysql.connect(
        host='49.4.71.22',
        user='admin',
        password='admin963',
        database='test',
        charset='utf8mb4',  # poem text is Chinese; avoid latin-1 mojibake
    )
    return db

def createtable_poem():
    """Create the ``poem`` table if it does not already exist.

    Opens its own connection via conn_mysql(); the original leaked both
    the cursor and the connection, so both are now closed deterministically.
    """
    sql = 'create table if not exists poem(model_name varchar(50),poem_name varchar(50),' \
          'author_name varchar(50),dynasty varchar(50),content text)'
    db = conn_mysql()
    try:
        # pymysql cursors are context managers; this closes the cursor
        # even if execute() raises.
        with db.cursor() as cur:
            cur.execute(sql)
        db.commit()
    finally:
        db.close()  # the original never closed this connection

def get_html(url):
    """Fetch *url* over HTTP and return the body decoded as UTF-8.

    The original never closed the HTTP response object returned by
    urlopen(); the context manager guarantees the socket is released.
    """
    with request.urlopen(url) as resp:
        return resp.read().decode('utf-8')

def get_url_list(html):
    """Extract per-poem link paths from a category page and absolutize
    them against ``base_url``.

    NOTE(review): the HTML-tag portion of this regex ('.*?.*?') looks like
    it was stripped when the article was scraped; restore the original
    ``<a href=...>`` pattern from the live page before use.
    """
    res = '.*?.*?'
    matches = re.compile(res).findall(html)
    return [base_url + path for path in matches]

def get_poem_content(url):
    """Scrape (poem_name, dynasty, author, content) tuples from one poem page.

    NOTE(review): the HTML-tag portions of the original pattern were lost
    when this article was scraped — the regex below is a placeholder
    skeleton with the four capture groups the main loop indexes
    (pl[0]=title, pl[1]=dynasty, pl[2]=author, pl[3]=body). Restore the
    real ``<div class="sons">...`` markup from the live page before use.
    """
    html = get_html(url)
    res = r'(.*?)(.*?).*?(.*?)\n\n([\s\S]*?)\n'  # TODO confirm against live markup
    return re.compile(res).findall(html)


if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for elapsed-time measurement.
    start = time.perf_counter()
    db = conn_mysql()
    createtable_poem()
    model_name = []
    i = 0  # running count of rows inserted across all categories
    j = 0  # index of the current category in model_name
    for g in get_model_url(shiwen_url):
        # g is a tuple of regex groups: g[0] = path fragment, g[1] = name.
        url = 'https://so.gushiwen.org/' + list(g)[0] + '.aspx'
        model_name.append(list(g)[1])
        html = get_html(url)
        for s in get_url_list(html):
            i += 1
            poems = get_poem_content(s)
            # Fall back to NULL placeholders when a page fails to parse,
            # instead of crashing on an empty match list.
            if len(poems) == 0:
                pl = ['NULL', 'NULL', 'NULL', 'NULL']
            else:
                pl = poems[0]
            sql = ('insert into poem(model_name,poem_name,author_name,dynasty,content) '
                   'values (%s,%s,%s,%s,%s)')
            # NOTE(review): the tags this re.sub strips from the poem body
            # (presumably <br>/<p>-style markup) were lost in the scrape;
            # restore the real pattern before running.
            content = re.sub('\n', '', pl[3])
            data = [model_name[j], pl[0], pl[2], pl[1], content]
            db.cursor().execute(sql, data)
            db.commit()
            print('Success!!! ' + '当前模块 :' + model_name[j] + '-----' +
                  '已导入------' + str(i) + '条数据' + '-----')
        j += 1
    db.close()
    print('Success!!!')
    print('End!!!')
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))

解析主要使用re模块,正则匹配!实测可用!!!

你可能感兴趣的:(python爬虫)