{"id":1195,"date":"2024-11-19T19:14:33","date_gmt":"2024-11-19T11:14:33","guid":{"rendered":"https:\/\/gemmartdesign.com\/?p=1195"},"modified":"2024-11-29T20:29:41","modified_gmt":"2024-11-29T12:29:41","slug":"python-post13","status":"publish","type":"post","link":"https:\/\/gemmartdesign.com\/?p=1195","title":{"rendered":"Python\u554f\u984c\u7d00\u9304#13-\u4f7f\u7528BeautifulSoup\u6a21\u7d44\u548c\u8ff4\u5708\u6293\u53d6\u591a\u9801\u8cc7\u6599"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u672c\u5468\u76ee\u6a19<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u5b78\u7fd2WebCrawler\u57fa\u672c\u7528\u6cd5\u8207\u61c9\u7528<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u4efb\u52d9<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5982\u4f55\u8a2a\u554f\u7db2\u7ad9\u52a0\u5165\u700f\u89bd\u5668\u8cc7\u8a0a\u4e26\u53d6\u5f97\u7db2\u7ad9\u8cc7\u6599<\/li>\n\n\n\n<li>\u4f7f\u7528BeautifulSoup\u6293\u53d6\u7db2\u7ad9\u6a19\u984c<\/li>\n\n\n\n<li>\u52a0\u5165cookie\u8a9e\u6cd5<\/li>\n\n\n\n<li>\u4f7f\u7528\u8ff4\u5708\u6293\u53d6\u591a\u9801\u8cc7\u6599<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">\u5c08\u6848\u7df4\u7fd2<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">\u76ee\u6a19: \u6293\u53d6PTT\u516b\u5366\u72481-5\u9801\u9762\u6a19\u984c<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u5efa\u7acb\u51fd\u5f0f\uff0c\u767c\u9001\u8acb\u6c42\u4e26\u53d6\u5f97\u8cc7\u6599<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import urllib.request as req\ndef getData(url):\n    request=req.Request(url,headers={\n        \"cookie\":\"over18=1\",\n        \"user-agent\":\"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/130.0.0.0 Safari\/537.36\"})\n    with req.urlopen(request) as response:\n        data=response.read().decode(\"utf-8\")<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u4f7f\u7528BeautifulSoup\u6a21\u7d44\u89e3\u6790\u7db2\u7ad9\u8cc7\u6599<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>    from bs4 
import BeautifulSoup\n    root=BeautifulSoup(data,\"html.parser\")<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u6293\u53d6\u6587\u7ae0\u6a19\u984c<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>    titles=root.find_all(\"div\",class_=\"title\")\n    for title in titles:\n        if title.a:\n            print(title.a.string)<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u6ce8\u610f: \u932f\u8aa4\u7a0b\u5f0f\u78bc\uff0c\u6293\u4e0d\u5230\u8cc7\u6599<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>for title in titles:\n    if title.a:\n        print(titles.a.string)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">\u932f\u8aa4\u8a0a\u606f<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>AttributeError: ResultSet object has no attribute 'a'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">\u8a9e\u6cd5\u8981\u524d\u5f8c\u547c\u61c9\uff0ctitles\u61c9\u8a72\u70batitle\uff0c\u56e0\u70ba\u6700\u5f8c\u5370\u51fa\u8cc7\u6599\u662f\u6bcf\u500btitle\uff0c\u4e0d\u662f\u6307\u627e\u5168\u90e8\u7684titles<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4fee\u6b63\u5f8c\u8a9e\u6cd5<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>for title in titles:\n    if title.a:\n        print(title.a.string)<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u627e\u5230\u4e0a\u9801\u9023\u7d50\uff0c\u8b93\u7a0b\u5f0f\u81ea\u52d5\u6293\u53d6<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>    nextLink=root.find(\"a\",string=\"\u2039 \u4e0a\u9801\")\n    return nextLink&#91;\"href\"]<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u4e3b\u7a0b\u5f0f: \u4f7f\u7528\u8ff4\u5708\u722c\u53d6\u591a\u9801\u8cc7\u6599<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>pageURL=\"https:\/\/www.ptt.cc\/bbs\/Gossiping\/index.html\"\ncount=0\nwhile count&lt;5:\n    pageURL=\"https:\/\/www.ptt.cc\"+getData(pageURL)\n    count+=1<\/code><\/pre>\n\n\n\n<h4 
class=\"wp-block-heading\">\u5b8c\u6574\u7a0b\u5f0f<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import urllib.request as req\ndef getData(url):\n    request=req.Request(url,headers={\n        \"cookie\":\"over18=1\",\n        \"user-agent\":\"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/130.0.0.0 Safari\/537.36\"})\n    with req.urlopen(request) as response:\n        data=response.read().decode(\"utf-8\")\n\n    from bs4 import BeautifulSoup\n    root=BeautifulSoup(data,\"html.parser\")\n    titles=root.find_all(\"div\",class_=\"title\")\n    for title in titles:\n        if title.a:\n            print(title.a.string)\n\n    nextLink=root.find(\"a\",string=\"\u2039 \u4e0a\u9801\")\n    return nextLink&#91;\"href\"]\n\npageURL=\"https:\/\/www.ptt.cc\/bbs\/Gossiping\/index.html\"\ncount=0\nwhile count&lt;5:\n    pageURL=\"https:\/\/www.ptt.cc\"+getData(pageURL)\n    count+=1<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">Output<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u56e0\u8cc7\u6599\u592a\u591a\uff0c\u4ee5\u4e0b\u793a\u7bc4\u5e7e\u884c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&#91; \u597d\u96f7] \u795e\u9b3c\u6230\u58eb2 \u7dca\u6263\u7f85\u99ac\u5efa\u57ce\u795e\u8a71\u7684\u6545\u4e8b     \n&#91;\u597d\u96f7] \u6771\u4eac\u653b\u7565&#91;\u6e2f\u7247]\uff082000\uff09\n&#91;\u554f\u7247] \u4e2d\u5e74\u7537\u627e\u8857\u908a\u5973\u90ce\u7d14\u804a\u5929\u7684\u96fb\u5f71\n&#91;\u60c5\u5831] \u5287\u5834\u7248\u300c\u9032\u64ca\u7684\u5de8\u4eba\u300d\u5b8c\u7d50\u7bc71\/3\u4e0a\u6620     \n&#91;\u554f\u7247] \u4e00\u90e8\u5075\u63a2\u5047\u88dd\u4e0a\u540a\u4f46\u5047\u6b7b\u7684\u5f71\u7247\n&#91;  \u96f7] \u6afb\u6843\u865f\n&#91;\u597d\u96f7]\u300a\u7d05\u8272\u4e00\u865f\u300b\u4ee5\u6eff\u6eff\u5fc3\u610f\uff0c\u6253\u78e8\u51fa\u8056\u8a95\u9b54\u529b\n&#91;\u65b0\u805e] \u300c\u571f\u751f\u82b1\u958b\u300d\u8a18\u9304\u4e0b\u91d1\u679d\u6f14\u793e 
\u91cd\u73fe\u5433\u670b\u5949\u73cd\u8cb4\u8eab\u5f71\n&#91;\u516c\u544a] \u96fb\u5f71\u677f\u677f\u898f 2022\/12\/5\n&#91;\u516c\u544a] \u7981\u653f\u6cbb\u7248\u898f \u53ca \u6295\u7968\u7d50\u679c<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u5468\u76ee\u6a19 \u5b78\u7fd2WebCrawler\u57fa\u672c\u7528\u6cd5\u8207\u61c9\u7528 \u4efb\u52d9 \u5c08\u6848&hellip;<\/p>\n","protected":false},"author":1,"featured_media":1229,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"content-type":"","pagelayer_contact_templates":[],"_pagelayer_content":"","_lmt_disableupdate":"","_lmt_disable":"","footnotes":""},"categories":[9],"tags":[],"class_list":["post-1195","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-python"],"_links":{"self":[{"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/posts\/1195","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1195"}],"version-history":[{"count":4,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/posts\/1195\/revisions"}],"predecessor-version":[{"id":1237,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/posts\/1195\/revisions\/1237"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=\/wp\/v2\/media\/1229"}],"wp:attachment":[{"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1195"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/gemma
rtdesign.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1195"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/gemmartdesign.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1195"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}