{"id":20845,"date":"2023-05-28T23:16:55","date_gmt":"2023-05-28T14:16:55","guid":{"rendered":"http:\/\/www.code-magagine.com\/?p=20845"},"modified":"2023-06-25T22:56:20","modified_gmt":"2023-06-25T13:56:20","slug":"%e3%80%90python%e3%80%91%e3%80%8cbeautifulsoup%e3%80%8d%e3%81%ab%e3%81%a4%e3%81%84%e3%81%a6","status":"publish","type":"post","link":"http:\/\/www.code-magagine.com\/?p=20845","title":{"rendered":"\u3010Python\u3011\u300cBeautifulSoup\u300d\u306b\u3064\u3044\u3066"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb<\/h2>\n\n\n\n<pre class=\"wp-block-preformatted\">conda install beautifulsoup4\n\n\u307e\u305f\u306f\n\npip install beautifulsoup4<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u30a4\u30f3\u30dd\u30fc\u30c8<\/h2>\n\n\n\n<pre class=\"wp-block-preformatted\">from bs4 import BeautifulSoup\nimport requests<\/pre>\n\n\n\n<p>BeautifulSoup\u306e\u4ed6\u306bRequests\u3092import\u3057\u307e\u3059\u3002\uff08\u3082\u3057\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u3066\u3044\u306a\u3044\u5834\u5408\u306f\u4e8b\u524d\u306bpip\u306a\u3069\u3067\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u3066\u304a\u304f\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002\uff09<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u69cb\u6587<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>BeautifulSoup(\u89e3\u6790\u5bfe\u8c61\u306eHTML\/XML,\u5229\u7528\u3059\u308b\u30d1\u30fc\u30b5\u30fc)<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">BeautifulSoup\u3067\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u30d1\u30fc\u30b5\u30fc<\/h3>\n\n\n\n<figure class=\"wp-block-table\"><table><tbody><tr><td>\u30d1\u30fc\u30b5\u30fc<\/td><td>\u6307\u5b9a\u65b9\u6cd5<\/td><td>\u7279\u5fb4<\/td><\/tr><tr><td>Python's html.parser<\/td><td>\"html.parser\"<\/td><td>\u8ffd\u52a0\u30e9\u30a4\u30d6\u30e9\u30ea\u304c\u4e0d\u8981<\/td><\/tr><tr><td>lxml's HTML 
parser<\/td><td>\"lxml\"<\/td><td>\u9ad8\u901f\u306b\u51e6\u7406\u304c\u53ef\u80fd<\/td><\/tr><tr><td>lxml's XML parser<\/td><td>\"xml\"<\/td><td>XML\u306b\u5bfe\u5fdc\u3057\u3001\u9ad8\u901f\u306b\u51e6\u7406\u304c\u53ef\u80fd<\/td><\/tr><tr><td>html5lib<\/td><td>\"html5lib\"<\/td><td>\u6b63\u3057\u304fHTML5\u3092\u51e6\u7406\u53ef\u80fd<\/td><\/tr><\/tbody><\/table><figcaption class=\"wp-element-caption\">\u4f55\u3082\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u3066\u3044\u306a\u3044\u74b0\u5883\u3060\u3068\u30c7\u30d5\u30a9\u30eb\u30c8\u304c\u300chtml.parser\u300d\u306b\u306a\u308a\u307e\u3059\u304c\u3001\u4f8b\u3048\u3070lxml\u3092\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u3066\u3044\u308b\u74b0\u5883\u3060\u3068\u81ea\u52d5\u7684\u306b\u4ed6\u306e\u30d1\u30fc\u30b5\u30fc\u304c\u9078\u629e\u3055\u308c\u3066\u3057\u307e\u3046\u53ef\u80fd\u6027\u304c\u3042\u308a\u307e\u3059\u3002\u306a\u306e\u3067\u57fa\u672c\u7684\u306b\u660e\u793a\u7684\u306b\u6307\u5b9a\u3059\u308b\u3088\u3046\u306b\u3057\u307e\u3057\u3087\u3046\u3002<\/figcaption><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">\u5b9f\u88c5<\/h2>\n\n\n\n<p>Requests\u3068\u7d44\u307f\u5408\u308f\u305b\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">response = requests.get('\u30b5\u30a4\u30c8\u306eURL')\n\nif response.status_code == 200:\n    soup = BeautifulSoup(response.text, 'html.parser') # HTML\u306e\u89e3\u6790\n    soup.h2 # \u6700\u521d\u306eh2\u3092\u53d6\u5f97\u3059\u308b\u3002\nelse:\n    raise RuntimeError(response.status_code) # print\u306a\u3069\u3067\u30ed\u30b0\u51fa\u529b\u3067\u3082\u826f\u3044\u3067\u3059\u3002<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u30bf\u30b0\u53d6\u5f97\u30e1\u30bd\u30c3\u30c9<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">\u5358\u6570\u306e\u30bf\u30b0\u3092\u53d6\u5f97<\/h3>\n\n\n\n<pre class=\"wp-block-preformatted\"># 
h2\u3092\u53d6\u5f97\nsoup.h2\n\n\u307e\u305f\u306f\n\nsoup.find('h2')<\/pre>\n\n\n\n<p>\u6700\u521d\u306eh2\u3092\u53d6\u5f97\u3059\u308b\u3002\u3069\u3061\u3089\u3092\u4f7f\u3063\u3066\u3082\u826f\u3044\u306e\u3067\u3059\u304c\u3001find\u306e\u65b9\u304c\u304a\u3059\u3059\u3081\u3067\u3059\u3002\u306a\u305c\u306a\u3089\u5168\u3066\u306e\u30bf\u30b0\u3092\u53d6\u5f97\u3057\u305f\u3044\u5834\u5408\u306b\u306f\u300cfind_all()\u300d\u3068\u3044\u3046\u30e1\u30bd\u30c3\u30c9\u304c\u3042\u308b\u306e\u3067\u3001\u4e00\u3064\u3092\u53d6\u5f97\u3059\u308b\u5834\u5408\u306ffind\u3092\u4f7f\u3063\u305f\u65b9\u304c\u308f\u304b\u308a\u3084\u3059\u3044\uff08\u53ef\u8aad\u6027\u304c\u9ad8\u3044\uff09\u305f\u3081\u3067\u3059\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u30c6\u30ad\u30b9\u30c8\u3092\u53d6\u5f97<\/h4>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.h2.text \n\n\u307e\u305f\u306f\n\nsoup.h2.get_text()<\/pre>\n\n\n\n<p>h2\u306e\u30c6\u30ad\u30b9\u30c8\u3002\u3053\u308c\u306fget_text()\u3067\u306f\u306a\u304ftext\u3092\u4f7f\u3063\u305f\u65b9\u304c\u826f\u3044\u3067\u3057\u3087\u3046\u3002\u7406\u7531\u306f\u4ee5\u4e0b\u3067\u3059\u3002<\/p>\n\n\n\n<ul>\n<li>.text\u306e\u65b9\u304c\u7c21\u6f54\u306b\u66f8\u3051\u308b\u305f\u3081\u3002<\/li>\n\n\n\n<li>Python\u306e\u8f9e\u66f8\u578b\u306eget()\u304c\u5024\u304c\u8fd4\u3063\u3066\u3053\u306a\u3044\u5834\u5408\u306fNone\u304c\u8fd4\u308b\u306e\u306b\u3001get_text()\u3060\u3068\u30a8\u30e9\u30fc\u306b\u306a\u308a\u7d1b\u3089\u308f\u3057\u3044\u305f\u3081\u3002<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">\u8907\u6570\u306e\u30bf\u30b0\u3092\u53d6\u5f97<\/h3>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find_all('h2')<\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u4e00\u3064\u76ee\u306e\u30bf\u30b0\u3092\u53d6\u5f97<\/h4>\n\n\n\n<pre 
class=\"wp-block-preformatted\">soup.find_all('h2')[0]<\/pre>\n\n\n\n<p>\u3053\u308c\u306f\u300csoup.find('h2')\u300d\u3068\u540c\u3058\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u8907\u6570\u306e\u30bf\u30b0\u3092\u540c\u6642\u306b\u691c\u7d22<\/h4>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find_all(['h2', 'h3'])<\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u8907\u6570\u306e\u30bf\u30b0\u306e\u4e2d\u304b\u3089\u3001\u30c6\u30ad\u30b9\u30c8\u3092\u53d6\u5f97<\/h4>\n\n\n\n<pre class=\"wp-block-preformatted\">for h2_tag in soup.find_all('h2'):\n    print(h2_tag.text)<\/pre>\n\n\n\n<p>find_all\u306e\u7d50\u679c\u306flist\u5f62\u5f0f\u3067\u5024\u304c\u8fd4\u3063\u3066\u304f\u308b\u306e\u3067\u3001\u4e0a\u8a18\u306e\u3088\u3046\u306bfor\u6587\u3092\u4f7f\u3048\u307e\u3059\u3002\u307e\u305f\u3001enumerate\u306a\u3069\u306e\u69cb\u6587\u3082\u4f7f\u3048\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">h2_text_list = [tag.text for tag in soup.find_all('h2')]<\/pre>\n\n\n\n<p>\u30ea\u30b9\u30c8\u5185\u5305\u8868\u8a18\u3092\u4f7f\u3063\u3066\u4e0a\u8a18\u306e\u3088\u3046\u306bh2\u306e\u30c6\u30ad\u30b9\u30c8\u306e\u30ea\u30b9\u30c8\u3092\u4f5c\u3063\u305f\u308a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u7279\u5b9a\u306eclass\u3092\u6301\u3064\u30bf\u30b0\u3092\u53d6\u5f97<\/h4>\n\n\n\n<p>\u30bf\u30b0\u6307\u5b9a\u3060\u3068\u4ef6\u6570\u304c\u591a\u3059\u304e\u308b\u30b1\u30fc\u30b9\u304c\u591a\u3044\u306e\u3067\u901a\u5e38\u306fclass\u6307\u5b9a\u306e\u65b9\u304c\u826f\u3044\u3067\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find_all('span', 
class_='\u30af\u30e9\u30b9\u540d')<\/pre>\n\n\n\n<p>Python\u306e\u6587\u6cd5\u306eclass\u3068\u533a\u5225\u3059\u308b\u305f\u3081\u306bBeautifulSoup\u3067\u306fclass\u306e\u672b\u5c3e\u306b\u30a2\u30f3\u30c0\u30fc\u30b9\u30b3\u30a2\u3092\u3064\u3051\u308b\u3088\u3046\u3067\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find_all('span', {'class': '\u30af\u30e9\u30b9\u540d'})<\/pre>\n\n\n\n<p>\u8f9e\u66f8\u3067\u6307\u5b9a\u3059\u308b\u3053\u3068\u3082\u53ef\u80fd\u3067\u3059\u3002<\/p>\n\n\n\n<h5 class=\"wp-block-heading\">\u8907\u6570\u306eclass\u3092\u691c\u7d22<\/h5>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find_all('span', class_=['\u30af\u30e9\u30b9\u540d1', '\u30af\u30e9\u30b9\u540d2'])<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">\u691c\u7d22\u7bc4\u56f2\u3092\u72ed\u304f\u3057\u3066\u304b\u3089\u30bf\u30b0\u3092\u53d6\u5f97\u3059\u308b\u624b\u6cd5<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u691c\u7d22\u7bc4\u56f2\u3092\u72ed\u3081\u308b\u3002<\/h4>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find('article').find_all(['h2', 'h3'])<\/pre>\n\n\n\n<p>\u8a18\u4e8b\u306e\u5168HTML\u304b\u3089\u6bce\u56de\u5168\u3066\u306e\u30bf\u30b0\u3092\u53d6\u5f97\u3057\u3066\u304f\u308b\u3068\u3044\u3046\u5f62\u3067\u306f\u975e\u5e38\u306b\u5927\u91cf\u306e\u30bf\u30b0\u304c\u6bce\u56de\u30d2\u30c3\u30c8\u3057\u3066\u3057\u307e\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u305d\u3053\u3067\u3001find\u3067\u307e\u305aarticle\u3068\u3044\u3046\u30bf\u30b0\u69cb\u9020\u3092\u53d6\u5f97\u3057\u3066\u305d\u306e\u4e2d\u304b\u3089\u5fc5\u8981\u306a\u30bf\u30b0\u3092\u53d6\u5f97\u3057\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u8a18\u4e8b\u306e\u30dc\u30ea\u30e5\u30fc\u30e0\u306e\u591a\u3044\u30b5\u30a4\u30c8\u3067\u3042\u308c\u3070\u4f7f\u3048\u308b\u306e\u3067\u975e\u5e38\u306b\u3088\u304f\u4f7f\u3046\u30c6\u30af\u30cb\u30c3\u30af\u306a\u306e\u3067\u662f\u975e\u899a\u3048\u3066\u304a\u304d\u307e\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<h4 
class=\"wp-block-heading\">\u8a72\u5f53\u3059\u308b\u8981\u7d20\u3092\u524a\u9664\u3059\u308b\u3002<\/h4>\n\n\n\n<pre class=\"wp-block-preformatted\">soup.find('h2', class_='\u30af\u30e9\u30b9\u540d').extract()\nsoup.find_all('h2')<\/pre>\n\n\n\n<p>\u6307\u5b9a\u3057\u305fclass\u540d\u306e\u8981\u7d20\u304c\u6d88\u3048\u307e\u3059\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u3069\u3061\u3089\u3092\u9078\u3079\u3070\u826f\u3044\u306e\u304b\uff1f<\/h4>\n\n\n\n<p>\u300c\u691c\u7d22\u7bc4\u56f2\u3092\u72ed\u304f\u3059\u308b\u65b9\u6cd5\u300d\u3092\u9078\u3073\u307e\u3057\u3087\u3046\u3002<span style=\"color: #ff0000;\"><strong>\u300c\u8a72\u5f53\u3059\u308b\u8981\u7d20\u3092\u524a\u9664\u3059\u308b\u300d\u306e\u306f\u7834\u58ca\u7684\u306a\u65b9\u6cd5\u306b\u306a\u3063\u3066\u3057\u307e\u3046<\/strong><\/span>\u305f\u3081\u3067\u3059\u3002\u4eee\u306b\u6d88\u3057\u3066\u3057\u307e\u3063\u305f\u60c5\u5831\u306e\u4e2d\u304b\u3089\u5225\u306b\u6b32\u3057\u3044\u60c5\u5831\u304c\u3042\u3063\u305f\u5834\u5408\u306b\u3082\u3046\u4e00\u5ea6Requests\u3092\u4f7f\u3063\u3066\u30ea\u30af\u30a8\u30b9\u30c8\u3057\u76f4\u3059\u5fc5\u8981\u304c\u3042\u308b\u306e\u3067\u30ea\u30af\u30a8\u30b9\u30c8\u6570\u304c\u5897\u3048\u3066\u3057\u307e\u3044\u304a\u76f8\u624b\u306e\u30b5\u30fc\u30d0\u30fc\u306b\u4f59\u8a08\u306b\u8ca0\u8377\u3092\u304b\u3051\u308b\u3053\u3068\u306b\u306a\u3063\u3066\u3057\u307e\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">XPath\u3092\u4f7f\u3046\u3002<\/h2>\n\n\n\n<p>lxml\u3068\u3044\u3046\u30d1\u30c3\u30b1\u30fc\u30b8\u3092\u4f7f\u3046\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>pip install lxml<\/code><\/pre>\n\n\n\n<h3 
class=\"wp-block-heading\">\u5b9f\u88c5<\/h3>\n\n\n\n<p>\u4f8b\u3048\u3070\u3001h2\u5185\u306ea\u30bf\u30b0\u306ehref\u5c5e\u6027\u306e\u5024\u3092\u53d6\u5f97\u3059\u308b\u30b5\u30f3\u30d7\u30eb\u3068\u3057\u3066\u306f\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from lxml import html\n\nsoup = BeautifulSoup(response.text, features=\"lxml\")\nlxml_converted_data = html.fromstring(str(soup))\ndata = lxml_converted_data.xpath(\"\/\/h2\/a\/@href\")\nprint(data)<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u30c6\u30fc\u30d6\u30eb\u304b\u3089\u60c5\u5831\u3092\u62bd\u51fa\u3059\u308b\u4f8b<\/h2>\n\n\n\n<p>\u5404\u60c5\u5831\u3092\u8f9e\u66f8\u306b\u767b\u9332\u3057\u3066\u305d\u308c\u3092list\u306b\u683c\u7d0d\u3057\u3066\u3044\u304d\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">d_list = [] \ntable = soup.find('table', class_='table\u306eclass\u540d') \ntr_tags = table.find_all('tr', class_='tr\u306eclass\u540d')\n    \nfor tr_tag in tr_tags:        \n    \n    # \u6b32\u3057\u3044\u60c5\u5831\u3060\u3051\u629c\u304d\u51fa\u3059\u3002\n    madori, yatin, kyori = tr_tag.find_all('td')[2:5]\n    \n    # \u53d6\u5f97\u3057\u305f\u3059\u3079\u3066\u306e\u60c5\u5831\u3092\u8f9e\u66f8\u306b\u683c\u7d0d\u3059\u308b\n    d = {\n        'madori': madori.text, # \u4e2d\u8eab\u306e\u60c5\u5831\u3060\u3051\u53d6\u5f97\u3059\u308b\u3002\n        'yatin': yatin.text, # \u4e2d\u8eab\u306e\u60c5\u5831\u3060\u3051\u53d6\u5f97\u3059\u308b\u3002\n        'kyori':kyori.text, # \u4e2d\u8eab\u306e\u60c5\u5831\u3060\u3051\u53d6\u5f97\u3059\u308b\u3002\n    }\n    \n    # \u53d6\u5f97\u3057\u305f\u8f9e\u66f8\u3092d_list\u306b\u683c\u7d0d\u3059\u308b\n    
d_list.append(d)<\/pre>\n\n\n\n<p>\u306a\u305c[{\u8f9e\u66f81},{\u8f9e\u66f82}]\u306e\u3088\u3046\u306a\u30c7\u30fc\u30bf\u69cb\u9020\u306b\u3057\u3066\u3044\u308b\u304b\u3068\u3044\u3048\u3070\u3001\u4eca\u5f8c\u30c7\u30fc\u30bf\u3092\u4f55\u3089\u304b\u306e\u8868\u5f62\u5f0f\u3067\u4fdd\u5b58\u3057\u3066\u3044\u304f\u3053\u3068\u306b\u306a\u308b\u304b\u3068\u601d\u3044\u307e\u3059\u304c\u3001\u305d\u306e\u969b\u306b\u51e6\u7406\u304c\u975e\u5e38\u306b\u7c21\u5358\u306b\u306a\u308b\u305f\u3081\u3067\u3059\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb conda install beautifulsoup4 \u307e\u305f\u306f pip install beautifulsoup4 \u30a4\u30f3\u30dd\u30fc\u30c8 from bs4 import BeautifulSoup import  [&hellip;]","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[47],"tags":[],"_links":{"self":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/20845"}],"collection":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=20845"}],"version-history":[{"count":18,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/20845\/revisions"}],"predecessor-version":[{"id":21803,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/20845\/revisions\/21803"}],"wp:attachment":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=20845"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post
=20845"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=20845"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}