# 参考崔庆才爬虫 (Reference: Cui Qingcai's web-scraping tutorial)
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Parse the sample markup and print what each basic CSS selector matches.
doc=pq(html) # convert the markup into an element tree
print(doc('p')) # select by tag name
print(doc('#link1')) # id selector (a leading '#' matches an id, not a tag)
print(doc('.title')) # class selector
print(doc('.story #link2')) # the element with id "link2" under an element with class "story"
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
Elsie,
The Dormouse's story
Lacie and
from pyquery import PyQuery as pq
# Build the document from a URL (decoded as UTF-8) instead of an in-memory
# string, then print its <head> element.
doc=pq(url="http://www.baidu.com",encoding='utf-8')
print(doc('head'))
百度一下,你就知道
from pyquery import PyQuery as pq
# Load the document from a local HTML file and print its <table> elements.
# PyQuery treats a bare positional string as markup (or as a URL when it
# starts with http:// or https://), so a path has to be passed through the
# `filename` keyword for the file to actually be opened and parsed.
doc=pq(filename=".\\Text\\upload\\HTML.html")
print(doc('table'))
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Parse the sample markup and print the matches for class, id and nested selectors.
doc=pq(html)
print(doc('.title')) # class selector
print(doc('#123')) # id selector
print(doc('#123 #table .ul')) # nested selection: class "ul" under id "table" under id "123"
The Dormouse's story
我是table
我是ul
我是ul
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Demonstrate find(): search for <div>, then search again inside that result.
doc=pq(html)
div=doc.find('div')
print(div)
print(type(div)) # show what kind of object find() returns
# find() can be chained: this call searches within the <div> result above.
ul=div.find('ul')
print(type(ul))
print(ul)
我是table
我是ul
我是ul
t=doc.find('#123.div #table') # the element with id "table" under the element that has both id "123" and class "div"
print(t)
我是table
我是ul
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Collect the children of the parsed document, then show the result's type
# followed by the result itself, one per line.
doc = pq(html)
div = doc.children()
print(type(div), div, sep='\n')
The Dormouse's story
The Dormouse's story
我是table
我是ul
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Look up the element with id "t" and print its parent.
doc=pq(html)
ul=doc.find("#t")
parent=ul.parent() # parent of the #t element
print(parent)
我是table
我是ul
# Look up the element with id "t" and print all of its ancestors.
doc=pq(html)
ul=doc.find('#t')
parents=ul.parents()
print(type(parents)) # parents() collects every ancestor node
print(parents)
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
我是table
我是ul
ulllll
我是table
我是ul
ulllll
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
doc=pq(html)
siblings=doc.find('p').siblings('#123') # siblings of the <p> elements, restricted to those with id "123"
print(siblings)
我是table
我是ul
ulllll
我是table
我是ul
ulllll
我是table
我是ul
ulllll
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Look up the element with id "123" and print the type of the lookup result
# followed by the result itself, one per line.
doc = pq(html)
print(type(doc.find('#123')), doc.find('#123'), sep='\n')
我是table
我是ul
ulllll
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Print every <a> element individually.
doc=pq(html)
ls=doc.find('a').items() # items() produces an iterator over the matched elements
for p in ls:
    # The loop body must be indented; without it the snippet fails to parse.
    print(p)
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Read an attribute from the element(s) with class "story" in two equivalent ways.
doc=pq(html)
l=doc.find('.story')
print(l.attr('name')) # read the "name" attribute; pass any other attribute name the same way
print(l.attr.name) # same attribute via attribute-style access (the recorded output is identical)
我是一个p
我是一个p
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Print the matched element, then only its text.
doc=pq(html)
div=doc.find('#123.div') # element with id "123" and class "div"
print(div)
print(div.text()) # get the text content
我是table
我是ul
ulllll
我是table
我是ul
ulllll
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Print all <a> matches, then the result of html() on the same selection.
doc=pq(html)
a=doc.find('a')
print(a)
# NOTE(review): judging by the recorded output, html() yields the inner
# content of only the first matched element, not of every match.
print(a.html())
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
Elsie
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# Modify the classes of the <p> elements, printing them after each change.
doc=pq(html)
p=doc.find('p')
p.removeClass('title') # remove the class "title"
print(p)
print("........") # visual separator between the two printouts
p.addClass('newTitle') # add the class "newTitle"
print(p)
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
........
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
from pyquery import PyQuery as pq
html="""
The Dormouse's story
The Dormouse's story
我是table
我是ul
ulllll
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
我是span
...
"""
# Mutate the <span> step by step, printing it after every change.
doc=pq(html)
span=doc.find('span')
print(span)
span.attr('name','span') # add a name attribute
print(span)
span.attr('class','sd') # add a class
print(span)
span.attr('class','') # empty the class value; the class attribute itself still exists
print(span)
span.css('font-size','14px') # add an inline style
print(span)
我是span
我是span
我是span
我是span
我是span
html="""
我是span
我是a
"""
from pyquery import PyQuery as pq
# Remove the <a> found inside the element with class "span".
doc=pq(html)
span=doc.find('.span')
print(span)
# The value returned by remove() is kept in t; the recorded output of
# t.text() is the link's text, so t appears to hold the removed element.
t=span.find('a').remove()
print(t.text())
我是span
我是a
我是a
from pyquery import PyQuery as pq
html="""
p1
p2
p3
p4
p5
p6
name
"""
# Demonstrate pseudo-class selectors; the dotted prints separate each result.
doc=pq(html)
l1=doc("p:first-child") # <p> that is the first child of its parent
print(l1)
print("............")
# <p> that is the last child of its parent.
# NOTE(review): the recorded output for this selector is empty, presumably
# because the last child in the sample markup is not a <p> — confirm.
l2=doc("p:last-child")
print(l2)
print("...........")
l3=doc("p:nth-child(2)") # <p> that is the second child of its parent
print(l3)
print("...........")
l4=doc("p:gt(3)") # <p> matches after the fourth one (the fourth itself is excluded)
print(l4)
print("...........")
l5=doc("p:nth-child(2n)") # <p> elements at even child positions
print(l5)
print(".............")
l6=doc("li:contains(na)") # <li> elements whose text contains "na"
print(l6)
p1
............
...........
p2
...........
p5
p6
...........
p2
p4
p6
.............
name