Beautiful Soup 是一个HTML/XML的解析器,主要的功能也是如何解析和提取 HTML/XML 数据。
lxml 只会局部遍历,而Beautiful Soup 是基于HTML DOM的,会载入整个文档,解析整个DOM树,因此时间和内存开销都会大很多,所以性能要低于lxml。
BeautifulSoup 用来解析 HTML 比较简单,API非常人性化,支持CSS选择器、Python标准库中的HTML解析器,也支持 lxml 的 XML解析器。
pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
import asyncio
import functools
import re
house_info = []  # Module-level accumulator: one dict per listing, filled by get_house_info().
'''异步请求获取链家每页数据'''
async def get_page(page_index):
    """Asynchronously fetch one result page of Lianjia Pudong listings.

    The blocking ``requests.get`` call is dispatched to the event loop's
    default thread-pool executor so the loop itself is never blocked.

    :param page_index: 1-based page number to fetch.
    :return: the ``requests.Response`` for that page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    # requests has no default timeout: without one, a stalled server would
    # hang this coroutine (and its executor thread) forever.
    request = functools.partial(
        requests.get,
        f'https://sh.lianjia.com/ershoufang/pudong/pg{page_index}/',
        headers=headers,
        timeout=10,
    )
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, request)
    return response
'''Extract house info from a parsed page using CSS selectors (not XPath).'''
def get_house_info(soup):
    """Parse one listing page and append a dict per house to ``house_info``.

    Extraction uses BeautifulSoup CSS selectors (``select``/``select_one``).
    A listing card that is missing a sub-element yields an empty string for
    that field instead of crashing the whole scrape with an IndexError.

    :param soup: ``BeautifulSoup`` document of a Lianjia listing page.
    """
    whitespace = re.compile(r'\n|\s')

    def _clean(tag):
        # Strip all whitespace/newlines; tolerate a missing element.
        return whitespace.sub('', tag.get_text()) if tag is not None else ''

    for card in soup.select('.info'):
        house_info.append({
            'title': _clean(card.select_one('.title a')),
            'house_pattern': _clean(card.select_one('.houseInfo')),
            'price': _clean(card.select_one('.unitPrice')),
            'location': _clean(card.select_one('.positionInfo')),
            'total': _clean(card.select_one('.totalPrice')),
        })
'''Asynchronously fetch page 1, parse its listings into house_info, and print them.'''
async def get_first_page():
    """Fetch the first listing page, parse it, and print the collected data."""
    first_page = await get_page(1)
    page_soup = BeautifulSoup(first_page.text, 'lxml')
    get_house_info(page_soup)
    print(house_info)
if __name__ == '__main__':
    # Script entry point: scrape page 1 and dump the results to stdout.
    asyncio.run(get_first_page())
# ---------------------------------------------------------------------------
# BeautifulSoup API demo.
# Constructor signature for reference (all keyword arguments shown):
#   BeautifulSoup(markup="", features=None, builder=None, parse_only=None,
#                 from_encoding=None, exclude_encodings=None,
#                 element_classes=None)
# ---------------------------------------------------------------------------
from bs4 import BeautifulSoup

# Minimal document exercising title/head/body access, searching, and
# sibling/parent traversal below. (The original markup was stripped of its
# tags during copy-paste; reconstructed from the selectors used.)
html_str = (
    '<html><head><title>我是标题</title></head><body>'
    '<div class="div1">我是div1</div>'
    '<div class="div2">我是div2</div>'
    '<div class="div3">我是div3</div>'
    '</body></html>'
)
soup = BeautifulSoup(html_str, 'lxml')

# Direct tag access.
print('title:', soup.title)
print('head:', soup.head)
print('body:', soup.body)
print('html:', soup.html)

# Searching: find / find_all / CSS select.
print('find:', soup.find('div', attrs={'class': 'div1'}))
print('find_all:', soup.find_all('div'))
print('select:', soup.select('.div1'))
print('get_text:', soup.select('.div1')[0].get_text())
print('get:', soup.select('.div1')[0].get('class'))

# Tree traversal relative to one node.
div1 = soup.select('.div1')[0]
print('find_parents:', div1.find_parents('div'))
print('find_next_sibling:', div1.find_next_sibling())
print('find_previous_sibling:', div1.find_previous_sibling())
print('find_next:', div1.find_next())
print('find_previous:', div1.find_previous())