出处:http://thehtmldom.sourceforge.net/#getting_started
htmldom parses an HTML file and provides methods for iterating over and searching the parse tree in a way similar to jQuery.
Language Requirement: Python 3.2.x
Platforms Available: Linux, Windows
from htmldom import htmldom dom = htmldom.HtmlDom() #or dom = htmldom.HtmlDom( "http://www.example.com" )
The above code creates an HtmlDom object. The HtmlDom constructor takes an optional parameter, the URL of the page. If it is not provided, you can create elements dynamically.
dom = dom.createDom("<html></html>") #or, if you have provided the url, then just a createDom() call will suffice dom = dom.createDom()
Once the HtmlDom object is created, you need to call its createDom method. This parses the HTML data and constructs the parse tree, which can then be used for searching and manipulating the HTML data. The only restriction the library imposes is that the data, whether it is HTML or XML, must have a root element.
Selector expression | Meaning |
---|---|
* | Universal Selector |
E | Matches any element E |
E F | Matches any F element that is a descendant of an E element. |
E > F | Matches any F element that is a child of an element E. |
E + F | Matches any F element immediately preceded by a sibling element E. |
E[foo] | Matches any E element with the "foo" attribute set (whatever the value). |
E[foo=value] | Matches any E element whose "foo" attribute value is exactly equal to "value". |
E[foo~=value] | Matches any E element whose "foo" attribute value is a list of space-separated values, one of which is exactly equal to "value". |
E.dummy | Matches any E element that has a class attribute with the value "dummy". |
E#dummy | Matches any E element that has an id attribute with the value "dummy". |
#create a dom instance from htmldom import htmldom dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p>This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p>This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p>This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p>This is paragraph<strong>strong Element</strong></p></h4></html>""") # Getting p element from html data p = dom.find( "p" ) # You can print html content using "html" method of HtmlNodeList object print( p.html() ) # Getting all elements all = dom.find( "*" ) # Getting sibling elements using '+' sibling = dom.find( "div + div" ) # Getting Descendant element desc = dom.find( "div p strong" ) # Getting child element using '>' child = dom.find( "div > p > strong" ) # Selecting elements through attributes elem = dom.find( "div[id=one]" ) #or elem = dom.find( "[id]" ) #or elem = dom.find( "div[id] p" ) #or elem = dom.find( "div#one" ) #If 'one' were a class then, elem = dom.find( "div.one" )
from htmldom import htmldom dom = htmldom.HtmlDom( "http://www.example.com" ).createDom() # Find all the links present on a page and prints its "href" value a = dom.find( "a" ) for link in a: print( link.attr( "href" ) )
#Using the dom instance from the above code snippet div = dom.find( "div" ) # Gets all the children chldrn = div.children() #or, select only those children which have class 'dummy' chldrn = div.children( ".dummy" )
dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p>This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p>This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p>This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p>This is paragraph<strong>strong Element</strong></p></h4></html>""") # Get first div`s html div = dom.find( "div" ).first().html() # div=<div id='one'><p>This is paragraph<strong>strong Element</strong></p></div> #replace first "div`s" content with "b" tag: dom.find( "div" ).html( "<b>b Element</b>" )
#Using the dom instance from the above code snippet dom.find( "div" ).first().text( "div contents replaced" )
#Using the dom instance from the above code snippet dom.find( "div" ).first().attr( "id" ) # returns "one" #Adding new attribute dom.find( "div" ).first().attr( "class", "dummy" )
#Using the dom instance from the above code snippet dom.find( "div" ).first().removeAttr( "id" )
#Using the dom instance from the above code snippet # Gets only that div which has id attribute with value "one" div_one = dom.find( "div" ).filter( "[id=one]" )
#Using the dom instance from the above code snippet # Remove div#one from the current div`s set div_not_one = dom.find( "div" )._not( "[id=one]" )
#Using the dom instance from the above code snippet div = dom.find( "div" ).eq( 0 ) # Using list index syntax div = dom.find( "div" )[0] # Slicing div = dom.find( "div" )[1:]
#Using the dom instance from the above code snippet. div_first = dom.find( "div" ).first()
#Using the dom instance from the above code snippet. div_last = dom.find( "div" ).last()
#Using the dom instance from the above code snippet. # Find all "div" elements which contain "strong" element(s) as its descendant. div = dom.find( "div" ).has( "strong" )
#Using the dom instance from the above code snippet strong = dom.find( "div" ).children().children() if strong._is( "strong" ): print( "strong element is in the set" ) else: print( "strong element is not in the set" )
dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p>This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p>This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p>This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p>This is paragraph<strong>strong Element</strong></p></h4></html>""") # Gets next sibling elements of div element next = dom.find( "div" ).next() # next = [ div#two, p#three ] # Filtering the result set. next = dom.find( "div" ).next( "p#three" ) # next = [ p#three ] # Getting all the next elements of div next_all = dom.find( "div" ).nextAll() # next_all = [ div#two, p#three, h4#four ] # Filtering the result set. next_all = dom.find( "div" ).nextAll( "h4#four" ) # next_all = [ h4#four ] # Getting next sibling elements of div#one until h4 nexts = dom.find( "div#one" ).nextUntil( "h4" ) # nexts = [ div#two, p#three ]
dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p>This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p>This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p>This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p>This is paragraph<strong>strong Element</strong></p></h4></html>""") # Gets previous sibling elements of div element. next = dom.find( "div" ).prev() # next = [ div#one ] # Filtering the result set. next = dom.find( "div" ).prev( "p#three" ) # next = [] # Getting all the prev elements of h4. next_all = dom.find( "h4" ).prevAll() # next_all = [ div#two, p#three, div#one ] # Filtering the result set. next_all = dom.find( "h4" ).prevAll( "#one" ) # next_all = [ div#one ] # Getting previous sibling elements until div#one. prevs = dom.find( "h4" ).prevUntil( "div#one" ) # prevs = [ div#two, p#three ]
#Using the dom instance from the above code snippet. siblings = dom.find( "div#two" ).siblings() #siblings = [ div#one, p#three, h4#four ] # Filtering the result set. siblings = dom.find( "div#two" ).siblings( "#three" ) #siblings = [ p#three ]
dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p id="five">This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p id="six">This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p id="seven">This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p id="eight">This is paragraph<strong>strong Element</strong></p></h4></html> """) # Gets parent elements of strong element. parent = dom.find( "strong" ).parent() # parent = [ p#five, p#six, p#seven, p#eight ] # Filtering the result set. parent = dom.find( "strong" ).parent( "p#seven" ) # parent = [ p#seven ] # Getting all the parent elements of strong parents = dom.find( "strong" ).parents() # parents = [ div#two, p#three, div#one,p#five, p#six, p#seven, p#eight, html ] # Filtering the result set. parents = dom.find( "strong" ).parents( "#one" ) # parents = [ div#one ] # Getting parent elements until div#one. parent = dom.find( "strong" ).first().parentsUntil( "div#one" ) # parent = [ p#five ]
#Using the dom instance from the above code snippet. # First find all the strong elements. elems = dom.find( "strong" ) #then add p#three element to the set. elems.add( "p#three" )
#Using the dom instance from the above code snippet. elems = dom.find( "p" ).prev().andSelf() #elems = [ div#two, p#three ]
#Using the dom instance from the above code snippet. # First selects "html" element then finds "p", # adds a text node to it then revert back to the set containing "html" print( dom.find( "html" ).find( "p" ).append( "This is a paragraph" ).end().html() )
#Using the dom instance from the above code snippet. # Gets "p" element nested inside "html" element p = dom.find( "html" ).find( "p" )
#Using the dom instance from the above code snippet. # First select p elements p = dom.find( "p" ) # Then add "strong" elements to it. p_added = p.add( "strong" )
dom = htmldom.HtmlDom().createDom( """<html> <div id='one'><p id="five">This is paragraph<strong>strong Element</strong></p></div> <div id='two'><p id="six">This is paragraph<strong>strong Element</strong></p></div> <p id='three'><p id="seven">This is paragraph<strong>strong Element</strong></p></p> <h4 id='four'><p id="eight">This is paragraph<strong>strong Element</strong></p></h4></html> """) # Getting strong element strong = dom.find( "html div#one strong" )
#Using the dom instance from the above code snippet. dom.find( "div#one" ).append( "<b>b Element</b>" ) #or dom.find( "div#one" ).prepend( "<b>b Element</b>" ) #or dom.find( "div#one" ).after( "<b>b Element</b>" ) #or dom.find( "div#one" ).before( "<b>b Element</b>" ) # print its html to see the effect print( dom.find( "div#one" ).html() ) #or you can pass the HtmlNodeList object. dom.find( "div#one" ).append( dom.find( "div#two" ) ) #or dom.find( "div#one" ).prepend( dom.find( "div#two" ) ) #or dom.find( "div#one" ).after( dom.find( "div#two" ) ) #or dom.find( "div#one" ).before( dom.find( "div#two" ) ) # print its html to see the effect print( dom.find( "div#one" ).html() ) # Here "div#one" will be appended to "div#two" dom.find( "div#one" ).appendTo( dom.find( "div#two" ) ) # Here "div#one" will be prepended to "div#two" dom.find( "div#one" ).prependTo( dom.find( "div#two" ) ) # Here "div#one" will be attached as next sibling to "div#two" dom.find( "div#one" ).insertAfter( dom.find( "div#two" ) ) # Here "div#one" will be attached as next sibling of "div#two" dom.find( "div#one" ).insertAfter( dom.find( "div#two" ) ) # Here "div#one" will be attached as previous sibling of "div#two" dom.find( "div#one" ).insertBefore( dom.find( "div#two" ) ) # print its html to see the effect print( dom.find( "div#two" ).html() )