PyQuery

PyQuery库

初始化

字符串初始化

参考崔庆才爬虫

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html) #转换为标签树
print(doc('p')) #选择标签
print(doc('#link1')) #id选择器
print(doc('.title')) #类样式选择器
print(doc('.story #link2')) #选择类样式为story下id为link2的标签

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

Elsie,

The Dormouse's story

Lacie and

url初始化

from pyquery import PyQuery as pq

doc=pq(url="http://www.baidu.com",encoding='utf-8')
print(doc('head'))
百度一下,你就知道 

文档初始化

from pyquery import PyQuery as pq
doc=pq(filename=".\\Text\\upload\\HTML.html") #读取本地文件需使用filename参数,否则字符串会被当作HTML内容解析
print(doc('table'))

CSS选择器

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
print(doc('.title')) #类样式选择器
print(doc('#123')) #id选择器
print(doc('#123 #table .ul')) #嵌套选择,选择id为123的标签下id为table的标签下class为ul的标签

The Dormouse's story

我是table
    我是ul
    我是ul

查找元素

子元素

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
div=doc.find('div')
print(div)
print(type(div))
ul=div.find('ul')
print(type(ul))
print(ul)
我是table
    我是ul
    我是ul
t=doc.find('#123.div #table')        #选取id="123"并且class="div"的标签下id=table的标签
print(t)

我是table
    我是ul
from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
div=doc.children()
print(type(div))
print(div)

The Dormouse's story

The Dormouse's story

我是table
    我是ul

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

父元素

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
ul=doc.find("#t")
parent=ul.parent()
print(parent)

我是table
    我是ul
doc=pq(html)
ul=doc.find('#t')
parents=ul.parents()
print(type(parents))            #所有祖先节点
print(parents)

The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

我是table
    我是ul
    ulllll
我是table
    我是ul
    ulllll

兄弟节点

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
siblings=doc.find('p').siblings('#123') #获取所有id="123"的兄弟节点
print(siblings)
我是table
    我是ul
    ulllll
我是table
    我是ul
    ulllll
我是table
    我是ul
    ulllll

遍历

单个元素
from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
print(type(doc.find('#123')))
print(doc.find('#123'))

我是table
    我是ul
    ulllll
多个元素
from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
ls=doc.find('a').items() #.items()返回一个生成器,可用于逐个遍历匹配到的元素
for p in ls:
    print(p)
Elsie,

Lacie and

Tillie;
and they lived at the bottom of a well.

获取信息

获取属性
from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
l=doc.find('.story')
print(l.attr('name')) #获取name属性,其他属性填入即可
print(l.attr.name)
我是一个p
我是一个p
获取文本
from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
div=doc.find('#123.div')
print(div)
print(div.text()) #获取文本内容
我是table
    我是ul
    ulllll
我是table 我是ul ulllll

获取HTML

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
a=doc.find('a')
print(a)
print(a.html()) #匹配到多个元素时,html()只返回第一个元素的内容
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
Elsie

DOM操作

addClass removeClass

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

"""
doc=pq(html)
p=doc.find('p')
p.removeClass('title') #移除类样式class=title
print(p)
print("........")
p.addClass('newTitle') #添加类样式class=newTitle
print(p)

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

........

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

attr,css

from pyquery import PyQuery as pq

html="""
The Dormouse's story

The Dormouse's story

我是table
    我是ul
    ulllll

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

我是span

...

"""
doc=pq(html)
span=doc.find('span')
print(span)
span.attr('name','span') #添加name属性
print(span)
span.attr('class','sd') #添加类样式
print(span)
span.attr('class','') #令class为空,但是class属性存在
print(span)
span.css('font-size','14px') #添加style样式
print(span)
我是span

我是span

我是span

我是span

我是span
remove
html="""


我是span
我是a



"""
from pyquery import PyQuery as pq

doc=pq(html)
span=doc.find('.span')
print(span)
t=span.find('a').remove()
print(t.text())

我是span
我是a


我是a

伪类选择器

from pyquery import PyQuery as pq

html="""

    
    
        

p1

p2

p3

p4

p5

p6

  • name
  • """
    doc=pq(html)
    l1=doc("p:first-child") #第一个p
    print(l1)
    print("............")
    l2=doc("p:last-child") #最后一个p
    print(l2)
    print("...........")
    l3=doc("p:nth-child(2)") #第二个p
    print(l3)
    print("...........")
    l4=doc("p:gt(3)") #第四个之后(不包括四)
    print(l4)
    print("...........")
    l5=doc("p:nth-child(2n)") #第偶数个
    print(l5)
    print(".............")
    l6=doc("li:contains(na)") #选择文本中含有na的标签
    print(l6)

    p1

    ............ ...........

    p2

    ...........

    p5

    p6

    ...........

    p2

    p4

    p6

    .............
  • name
  • 
    

    你可能感兴趣的:(Python,Python,爬虫)