python-爬取古诗文网古诗

标题:爬取古诗文网古诗

# encoding=utf-8
import requests
import re

# 请求数据
# Patterns that pull the poem fields out of a listing page.
# NOTE(review): the HTML tags inside these patterns were lost when the script
# was pasted; they are reconstructed from what survived (the `.*?` runs and
# the capture groups). Confirm them against the site's current markup.
# re.DOTALL is needed because `.` does not match '\n' by default.
_TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<div class="contson" .*?>(.*?)</div>', re.DOTALL)
# Matches any single HTML tag, used to strip markup from the poem body.
_TAG_RE = re.compile(r'<.*?>')


def parse_page(url):
    """Download one listing page, print the poems on it, and return them.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys 'title', 'dynasty', 'author' and
        'content', one dict per poem found on the page. The list is empty
        when nothing matches.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari'
                      '/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict would be sent as the
    # query string and the User-Agent would never reach the server.
    # The timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.text

    titles = _TITLE_RE.findall(text)
    print(titles)
    dynasties = _DYNASTY_RE.findall(text)
    print(dynasties)
    authors = _AUTHOR_RE.findall(text)
    print(authors)
    content_tags = _CONTENT_RE.findall(text)

    # Holds the poem bodies with the markup removed.
    contents = []
    for content in content_tags:
        print(content)
        # Drop the HTML tags, then trim surrounding newlines and whitespace.
        contents.append(_TAG_RE.sub("", content).strip())

    # zip pairs the four parallel lists up by position; it stops at the
    # shortest list, so a field that failed to match shortens the result.
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        poems.append({
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        })

    for poem in poems:
        print(poem)
        print('=' * 80)
    return poems


def main():
    """Fetch and print listing pages 1 through 10."""
    for page in range(1, 11):
        # %s is replaced by the page number.
        url = "https://www.gushiwen.org/default_%s.aspx" % page
        parse_page(url)


if __name__ == '__main__':
    main()

你可能感兴趣的:(python-爬取古诗文网古诗)