爬虫学习4

PyQuery

初始化


%%html

<div id = "container">

<ul class="list">

<li class = "item-0">frist item</li>

<li class = "item-1"><a href="link2.html">second item</a></li>

<li class = "item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class = "item-1 active"><a href="link4.html">fourth item</a></li>

<li class = "item-0"><a href="link5.html">fifth item</a></li>

</ul>

</div>

字符串初始化


# Sample HTML document used as the parsing input for the snippets in this tutorial.
html = '''

<div id = "container">

<ul class="list">

<li class = "item-0">frist item</li>

<li class = "item-1"><a href="link2.html">second item</a></li>

<li class = "item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class = "item-1 active"><a href="link4.html">fourth item</a></li>

<li class = "item-0"><a href="link5.html">fifth item</a></li>

</ul>

</div>



'''

from pyquery import PyQuery as pq

# Build a PyQuery document directly from the HTML string.
doc = pq(html)

# Select every <li> element with a CSS selector and print the matching markup.
print(doc("li"))
<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

URL初始化


# Fetch the page over HTTP and ask PyQuery to decode the response as UTF-8.
# This replaces re-encoding the extracted text as ISO-8859-1 and decoding it
# again, which raises UnicodeEncodeError as soon as the text contains a
# character outside Latin-1.
# NOTE(review): the encoding argument is applied by PyQuery's requests-based
# opener -- confirm that requests is installed in the target environment.
doc = pq(url="http://www.baidu.com", encoding="utf-8")

# Print the text content of the <head> element.
print(doc("head").text())
百度一下,你就知道

文件初始化


# Build a PyQuery document from a local HTML file instead of a string.
doc = pq(filename = "test.html")

# Print every <li> element found in that file.
print(doc("li"))
<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

CSS选择器


# Re-parse the sample HTML string.
doc = pq(html)

# Descendant selector: <li> elements below class "list", inside the element
# whose id is "container".
print(doc("#container .list li"))
<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

查找元素


doc = pq(html)

# Select the element with class "list" (the <ul> in the sample document).
items = doc(".list")

print(type(items))

print(items)

print("------------------")



# find() searches below the current selection for elements matching the
# selector; the printed type shows the result is again a PyQuery object.
lis = items.find("li")

print(type(lis))

print(lis)
<class 'pyquery.pyquery.PyQuery'>

<ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>



------------------

<class 'pyquery.pyquery.PyQuery'>

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

子元素


# `items` is the ".list" selection created in the previous snippet.
# children() with no argument returns the child elements of the selection.
lis1 = items.children()

print(lis1)

print("------------------")



# Passing a selector keeps only the children that match it (class "active").
lis2 = items.children(".active")

print(lis2)
<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>



------------------

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

父级元素


# parent() returns the parent of the selected <li> children, i.e. the
# <ul class="list"> element they belong to.
parent = items.children().parent()

print(parent)
<ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>


# parents() returns the ancestors of the selected <li> elements; the output
# shows both the outer <div id="container"> and the <ul class="list">.
parents =items.children().parents()

print(parents)

print("------------------")

# This calls the resulting PyQuery object with a selector rather than passing
# the selector to parents(); the displayed output contains <ul class="list">
# twice.
# NOTE(review): items.children().parents(".list") may be what was intended for
# filtering the ancestors -- confirm.
print(parents(".list"))
<div id="container">

<ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>

</div><ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>



------------------

<ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>

<ul class="list">

<li class="item-0">frist item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

</ul>

兄弟元素


# Compound class selector: the element below ".list" that carries both the
# "item-0" and "active" classes (the third <li>).
third = doc(".list .item-0.active")

print(third)

print("----------")

# siblings() returns the other <li> elements that share the same parent.
brothers = third.siblings()

print(brothers)
<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



----------

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0">frist item</li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

遍历


doc = pq(html)

# items() returns a generator (see the printed type) that yields the matched
# elements one at a time.
lis = doc("li").items()

print(type(lis))

# Print each <li> followed by a separator line.
for li in lis:

    print(li)

    print("---")
<class 'generator'>

<li class="item-0">frist item</li>



---

<li class="item-1"><a href="link2.html">second item</a></li>



---

<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



---

<li class="item-1 active"><a href="link4.html">fourth item</a></li>



---

<li class="item-0"><a href="link5.html">fifth item</a></li>



---

获取信息

获取属性


doc = pq(html)

# Select the <a> inside the <li> that has both "item-0" and "active" classes.
a = doc(".item-0.active a")

print(a)

print("---")

# Read an attribute by name with attr("...").
print(a.attr("style"))

print("---")

# The same lookup is also available as attribute access on `attr`.
print(a.attr.href)
<a href="link3.html" style="color:black;"><span class="bold">third item</span></a>

---

color:black;

---

link3.html

获取文本


# Iterate over the <li> elements explicitly so the loop visits each list item
# and prints one line of text per item. Iterating doc.items() directly only
# walks the root selection, so the per-item output would depend on how text()
# formats nested block elements.
# `doc` is the document parsed from the sample HTML in the previous snippet.
texts = doc("li").items()

for text in texts:

    # text() returns the element's text content with the markup stripped.
    print(text.text())
frist item

second item

third item

fourth item

fifth item

获取HTML


# Select the <a> inside the <li> that has both "item-0" and "active" classes.
a = doc(".item-0.active a")

print(a)

print("---")

# html() returns the inner markup of the element (here the nested <span>).
print(a.html())
<a href="link3.html" style="color:black;"><span class="bold">third item</span></a>

---

<span class="bold">third item</span>

addClass,removeClass


doc = pq(html)

li = doc(".item-0.active")

print(li)

# removeClass() drops the given class from the element's class attribute.
li.removeClass('active')

print(li)

# addClass() puts it back; the element is modified in place, as the repeated
# prints of the same `li` object show.
li.addClass('active')

print(li)
<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



<li class="item-0"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>

attr,css


doc = pq(html)

li = doc(".item-0.active")

print(li)

# attr(name, value) sets an attribute on the selected element.
li.attr('name','link')

print(li)

# css(property, value) on the nested <a> appends the declaration to its
# existing style attribute (see the last printed line).
li("a").css('font-size','14px')

print(li)
<li class="item-0 active"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



<li class="item-0 active" name="link"><a href="link3.html" style="color:black;"><span class="bold">third item</span></a></li>



<li class="item-0 active" name="link"><a href="link3.html" style="color:black; font-size: 14px"><span class="bold">third item</span></a></li>

remove


doc =pq(html)

li = doc("li")

# Text of all <li> elements before any modification.
print(li.text())

# Remove every <a> below the <li> elements; only the first <li> has text that
# is not wrapped in an <a>, so only its text remains afterwards.
li.find('a').remove()

print(li.text())
frist item second item third item fourth item fifth item

frist item    

API


%%html

<pre style="line-height: 1.25; white-space: pre;">

        \          SORRY            /

         \                         /

          \    This page does     /

           ]   not exist yet.    [    ,'|

           ]                     [   /  |

           ]___               ___[ ,'   |

           ]  ]\             /[  [ |:   |

           ]  ] \           / [  [ |:   |

           ]  ]  ]         [  [  [ |:   |

           ]  ]  ]__     __[  [  [ |:   |

           ]  ]  ] ]\ _ /[ [  [  [ |:   |

           ]  ]  ] ] (#) [ [  [  [ :===='

           ]  ]  ]_].nHn.[_[  [  [

           ]  ]  ]  HHHHH. [  [  [

           ]  ] /   `HH("N  \ [  [

           ]__]/     HHH  "  \[__[

           ]         NNN         [

           ]         N/"         [

           ]         N H         [

          /          N            \

         /           q,            \

        /                           \

</pre>

伪类选择器


doc = pq(html)

# First child of its parent.
li = doc("li:first-child")

print(li.text())

# Last child of its parent.
li = doc("li:last-child")

print(li.text())

# Second child.
li = doc("li:nth-child(2)")

print(li.text())

# Every third child; with five items only the third one matches.
li = doc("li:nth-child(3n)")

print(li.text())

li = doc("li:gt(3)")  # selects the 5th item: everything after the 4th, whose index is 3

print(li.text())

li = doc("li:contains(second)")  # match elements by the text they contain

print(li.text())
frist item

fifth item

second item

third item

fifth item

second item

css选择器


CC BY-NC 4.0

数组
爬虫学习3-网页解析器

Comments