{"id":21211,"date":"2023-06-10T23:08:58","date_gmt":"2023-06-10T14:08:58","guid":{"rendered":"http:\/\/www.code-magagine.com\/?p=21211"},"modified":"2023-07-14T23:00:32","modified_gmt":"2023-07-14T14:00:32","slug":"%e3%80%90python%e3%80%91%e3%80%8cscrapy-selenium%e3%80%8d%e3%82%92%e6%89%b1%e3%81%86%e3%80%82","status":"publish","type":"post","link":"http:\/\/www.code-magagine.com\/?p=21211","title":{"rendered":"\u3010Python\u3011\u300cscrapy-selenium\u300d\u3092\u6271\u3046\u3002"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u4e8b\u524d\u6e96\u5099<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">1.Scrapy\u306e\u30d7\u30ed\u30b8\u30a7\u30af\u30c8\u3092\u4f5c\u6210\u3059\u308b\u3002<\/h3>\n\n\n\n<pre class=\"wp-block-preformatted\">scrapy startproject \u5bfe\u8c61\u30b5\u30a4\u30c8\u540d\ncd \u5bfe\u8c61\u30b5\u30a4\u30c8\u540d\nscrapy genspider \u4efb\u610f\u306espider\u540d www.xxx.com(\u30b5\u30a4\u30c8URL)<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">2.projects\/\u30d7\u30ed\u30b8\u30a7\u30af\u30c8\u914d\u4e0b\u306bchromedriver\u3092\u914d\u7f6e\u3057\u307e\u3059\u3002<\/h3>\n\n\n\n<p>chromedriver\u306f\u81ea\u5206\u306e\u304a\u4f7f\u3044\u306eChrome\u306e\u30d0\u30fc\u30b8\u30e7\u30f3\u3092\u8abf\u3079\u3066\u691c\u7d22\u3059\u308c\u3070\u5165\u624b\u53ef\u80fd\u3067\u3059\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3.scrapy-selenium\u306e\u8a2d\u5b9a\u3092\u3059\u308b\u3002<\/h3>\n\n\n\n<p>\u4ee5\u4e0b\u306e\u516c\u5f0f\u30da\u30fc\u30b8\u306b\u884c\u304d\u307e\u3059\u3002Configuration\u306e\u3068\u3053\u308d\u306b\u8a18\u8f09\u304c\u3042\u308b\u300c1\u300d\u3068\u300c2\u300d\u306e\u8a18\u8ff0\u3092settings.py\u306b\u8cbc\u308a\u4ed8\u3051\u307e\u3059\u3002<\/p>\n\n\n\n<figure class=\"wp-block-embed\"><div class=\"wp-block-embed__wrapper\">\nhttps:\/\/github.com\/clemfromspace\/scrapy-selenium\n<\/div><\/figure>\n\n\n\n<pre class=\"wp-block-preformatted\">\n\nSELENIUM_DRIVER_NAME = 'chrome'\nSELENIUM_DRIVER_EXECUTABLE_PATH = 
r'\u81ea\u5206\u306ePC\u306echromedriver\u306e\u30d1\u30b9'\nSELENIUM_DRIVER_ARGUMENTS=['-headless']  # '--headless' if using chrome instead of firefox\n\nDOWNLOADER_MIDDLEWARES = {\n    'scrapy_selenium.SeleniumMiddleware': 800\n}<\/pre>\n\n\n\n<p>\u306a\u304a\u3001firefox\u306b\u306a\u3063\u3066\u3044\u307e\u3059\u304c\u3001chrome\u304c\u57fa\u672c\u3060\u3068\u601d\u3044\u307e\u3059\u306e\u3067SELENIUM_DRIVER_NAME\u306f\u300cchrome\u300d\u3068SELENIUM_DRIVER_EXECUTABLE_PATH\u306f\u300cr'\u81ea\u5206\u306ePC\u306echromedriver\u306e\u30d1\u30b9'\u300d\u306b\u305d\u308c\u305e\u308c\u66f8\u304d\u63db\u3048\u307e\u3059\u3002\uff08which('chromedriver')\u3068\u3044\u3046\u6307\u5b9a\u3067\u3082\u74b0\u5883\u306b\u3088\u3063\u3066\u306f\u3044\u3051\u308b\u3063\u307d\u3044\u3067\u3059\u304c\u81ea\u5206\u306e\u74b0\u5883\u3060\u3068\u76f4\u63a5\u6307\u5b9a\u3057\u306a\u3044\u3068\u52d5\u304d\u307e\u305b\u3093\u3067\u3057\u305f\u3002\uff09<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">\u305d\u306e\u4ed6\u8a2d\u5b9a<\/h3>\n\n\n\n<p>scrapy-selenium\u3092\u4f7f\u3046\u5834\u5408\u3067\u3082\u6700\u4f4e\u9650\u306e\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u30de\u30ca\u30fc\u3068\u3057\u3066\u3053\u306e\u8fba\u306e\u8a2d\u5b9a\u306f\u3057\u3066\u304a\u304f\u3088\u3046\u306b\u3057\u307e\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>DOWNLOAD_DELAY = 5 # \u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u9593\u9694\u3092\u79d2\u6570\u3067\u6307\u5b9a\nFEED_EXPORT_ENCODING = 'utf-8' # \u6587\u5b57\u5316\u3051\u3057\u306a\u3044\u3088\u3046\u306b\u8a2d\u5b9a\u3057\u3066\u304a\u304f\nROBOTSTXT_OBEY = True # robots.txt\u304c\u3042\u308b\u5834\u5408\u306f\u305d\u308c\u306b\u5f93\u3046\u8a2d\u5b9a<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u691c\u7d22\u30dc\u30c3\u30af\u30b9\u306b\u5165\u529b\u3059\u308b\u30b5\u30f3\u30d7\u30eb\u5b9f\u88c5<\/h2>\n\n\n\n<pre class=\"wp-block-preformatted\">import scrapy\nfrom scrapy_selenium import SeleniumRequest\nfrom time import 
sleep\nfrom selenium.webdriver.common.keys import Keys\nfrom scrapy.selector import Selector\n\nclass XXXPythonSpider(scrapy.Spider):\n    name = \"spider\u540d\"\n\n    # \u6700\u521d\u306e\u30ea\u30af\u30a8\u30b9\u30c8\u306fstart_requests\u3067\u9001\u308b\u3002\n    def start_requests(self):\n        yield SeleniumRequest(\n            url='\u30b5\u30a4\u30c8\u306eURL',\n            wait_time=3,\n            callback=self.parse\n        )\n\n    # spider\u3067\u306f\u30ec\u30b9\u30dd\u30f3\u30b9\u306fparse\u30e1\u30bd\u30c3\u30c9\u3067\u53d7\u3051\u308b\u3002\n    def parse(self, response):\n        driver = response.meta['driver']\n        search_bar = driver.find_element_by_xpath('\u691c\u7d22\u30dc\u30c3\u30af\u30b9\u306eXpath')\n        search_bar.send_keys('python')\n        sleep(1)\n        driver.save_screenshot('xxx.png')\n        search_bar.send_keys(Keys.ENTER)\n        sleep(3)\n        w = driver.execute_script('return document.body.scrollWidth')\n        h = driver.execute_script('return document.body.scrollHeight')\n        driver.set_window_size(w,h)\n        driver.save_screenshot('yyy.png')\n        html = driver.page_source\n        sel = Selector(text=html)\n        for elem in sel.xpath('\u691c\u7d22\u7d50\u679c\u306eXPath'):\n\t\t    yield{\n\t\t        'title': elem.xpath('\u30bf\u30a4\u30c8\u30eb\u306eXPath').get(),\n\t\t        'URL': elem.xpath('URL\u306eXPath').get()\n\t\t    }\n<\/pre>\n\n\n\n<h3 
class=\"wp-block-heading\">start_requests<\/h3>\n\n\n\n<p>scrapy-selenium\u3092\u4f7f\u3046\u5834\u5408\u306fselenium\u306e\u30ea\u30af\u30a8\u30b9\u30c8\u3068\u3057\u3066\u51e6\u7406\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u306e\u3067\u3053\u308c\u3092\u30aa\u30fc\u30d0\u30fc\u30e9\u30a4\u30c9\u3057\u307e\u3059\u3002start_urls\u5c5e\u6027\u3067\u6307\u5b9a\u3055\u308c\u305fURL\u306b\u5bfe\u3057\u3066\u30ea\u30af\u30a8\u30b9\u30c8\u3092\u9001\u4fe1\u3057\u307e\u3059\u3002\uff08\u305f\u3060\u3001\u672c\u30b5\u30f3\u30d7\u30eb\u3067\u306f\u5f8c\u7d9a\u306eSeleniumRequest\u306b\u3066url\u3092\u5225\u9014\u6307\u5b9a\u3057\u3066\u3044\u308b\u306e\u3067\u4e0d\u8981\u306b\u306a\u308a\u307e\u3059\u3002\uff09<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">wait_time<\/h4>\n\n\n\n<p>\u5f85\u3061\u6642\u9593\u3067\u3059\u3002\u4e0a\u8a18\u30b5\u30f3\u30d7\u30eb\u3067\u306f3\u79d2\u3092\u6307\u5b9a\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">response<\/h3>\n\n\n\n<p>\u30c7\u30fc\u30bf\u578b\u306f\u300cscrapy.http.response.html.HtmlResponse\u300d\u30af\u30e9\u30b9\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">driver = response.meta['driver']<\/h3>\n\n\n\n<p>\u30ea\u30af\u30a8\u30b9\u30c8\u3057\u305fdriver\u306e\u60c5\u5831\u306f\u30ec\u30b9\u30dd\u30f3\u30b9\u306emeta\u30c7\u30fc\u30bf\u306e\u4e2d\u306b\u683c\u7d0d\u3055\u308c\u3066\u3044\u308b\u306e\u3067\u4e0a\u8a18\u306e\u3088\u3046\u306b\u53d6\u308a\u51fa\u3059\u3053\u3068\u304c\u53ef\u80fd\u3067\u3059\u3002\u3053\u306e\u30c9\u30e9\u30a4\u30d0\u30fc\u306e\u60c5\u5831\u3092\u3082\u3068\u306bxpath\u3084css\u3067\u60c5\u5831\u3092\u53d6\u5f97\u3057\u305f\u308a\u3057\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u3053\u3053\u3067\u53d6\u5f97\u3057\u305fdriver\u306e\u30c7\u30fc\u30bf\u578b\u306f\u300cselenium.webdriver.chrome.webdriver.WebDriver\u300d\u3068\u3044\u3046class\u306b\u306a\u3063\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<h3 
class=\"wp-block-heading\">driver.save_screenshot('xxx.png')<\/h3>\n\n\n\n<p>scrapy-selenium\u306f\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u304c\u57fa\u672c\u306a\u306e\u3067\u51e6\u7406\u306e\u6700\u5f8c\u3067\u30b9\u30af\u30ea\u30fc\u30f3\u30b7\u30e7\u30c3\u30c8\u3092\u53d6\u5f97\u3059\u308b\u3088\u3046\u306b\u3059\u308b\u65b9\u304c\u7121\u96e3\u3067\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">search_bar.send_keys(Keys.ENTER)<\/h3>\n\n\n\n<p>Enter\u30ad\u30fc\u3092\u62bc\u3059\u30a2\u30af\u30b7\u30e7\u30f3\u3067\u3059\u3002\uff08click\u3084submit\u306a\u3069\u306e\u30dc\u30bf\u30f3\u3092\u62bc\u3059\u30a2\u30af\u30b7\u30e7\u30f3\u3067\u3082\u826f\u3044\u304b\u3082\u3057\u308c\u307e\u305b\u3093\uff09<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">driver.set_window_size(w,h)<\/h3>\n\n\n\n<p>execute_script\u3067\u5b9f\u884c\u3057\u305fJavaScript\u3067\u753b\u9762\u30b5\u30a4\u30ba\u3092\u53d6\u5f97\u3057\u305f\u7d50\u679c\u3092\u8a2d\u5b9a\u3057\u3066\u3044\u307e\u3059\u3002\u901a\u5e38\u3060\u3068\u753b\u9762\u306e\u30b9\u30af\u30ea\u30fc\u30f3\u30b7\u30e7\u30c3\u30c8\u3092\u53d6\u5f97\u3057\u3066\u3082\u4e00\u90e8\u3057\u304b\u53d6\u5f97\u3067\u304d\u306a\u3044\u306e\u3067\u3059\u304c\u3001\u3053\u3046\u3059\u308b\u3053\u3068\u3067\u753b\u9762\u5168\u4f53\u306e\u30b9\u30af\u30ea\u30fc\u30f3\u30b7\u30e7\u30c3\u30c8\u3092\u53d6\u5f97\u3067\u304d\u307e\u3059\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">html = driver.page_source\u3001sel = 
Selector(text=html)<\/h3>\n\n\n\n<p>response\u306f\u6700\u521d\u306b\u53d6\u5f97\u3057\u305fHTML\u3057\u304b\u6301\u305f\u306a\u3044\u306e\u3067\u305d\u306e\u5f8c\u691c\u7d22\u30a2\u30af\u30b7\u30e7\u30f3\u3092\u884c\u3063\u305f\u5f8c\u306eHTML\u306b\u95a2\u3057\u3066\u306f\u5225\u9014\u30a2\u30af\u30b7\u30e7\u30f3\u5f8c\u306b\u53d6\u5f97\u3057\u3066\u3042\u3052\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002\u4e0a\u8a18\u306e\u3088\u3046\u306bhtml\u3092\u53d6\u5f97\u3057\u305f\u5f8c\u306bSelector\u30aa\u30d6\u30b8\u30a7\u30af\u30c8\u306b\u5909\u63db\u3057\u3066\u3042\u3052\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u6b21\u306e\u30da\u30fc\u30b8\u3092\u53d6\u5f97\u3059\u308b<\/h2>\n\n\n\n<p>\u4e0a\u306e\u4f8b\u306e\u3088\u3046\u306b\u4e00\u89a7\u753b\u9762\u306e\u3088\u3046\u306b\u8907\u6570\u30da\u30fc\u30b8\u3042\u308b\u5834\u5408\u306e\u60c5\u5831\u53d6\u5f97\u51e6\u7406\u306f\u4ee5\u4e0b\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002\u90fd\u5ea6SeleniumRequest\u3092\u3057\u3066\u3042\u3052\u3066\u3001\u6b21\u306e\u30da\u30fc\u30b8\u304c\u3042\u308b\u5834\u5408\u306e\u307f\u518d\u5ea6SeleniumRequest\u3092\u3057\u3066\u3042\u3052\u308b\u3088\u3046\u306b\u3057\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">    def parse(self, response):\n        driver = response.meta['driver']\n        yield SeleniumRequest(\n            url=driver.current_url,\n            wait_time=3,\n            callback=self.parse_next\n        )\n\n    def parse_next(self, response):\n        for elem in response.xpath('\u691c\u7d22\u7d50\u679c\u306eXPath'):\n\t\t    yield{\n\t\t        'title': elem.xpath('\u30bf\u30a4\u30c8\u30eb\u306eXPath').get(),\n\t\t        'URL': elem.xpath('URL\u306eXPath').get()\n            }\n        \n        next_page = response.xpath('\u6b21\u3078\u30dc\u30bf\u30f3\u306eXPath').get()\n        if next_page:\n            next_url = response.urljoin(next_page)\n            yield 
SeleniumRequest(\n                url=next_url,\n                wait_time=3,\n                callback=self.parse_next\n            )<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">driver.current_url<\/h3>\n\n\n\n<p>\u73fe\u5728\u306eURL\u3092\u518d\u5ea6\u30ea\u30af\u30a8\u30b9\u30c8\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">next_url = response.urljoin(next_page)<\/h3>\n\n\n\n<p>SeleniumRequest\u306f\u7d76\u5bfe\u30d1\u30b9\u6307\u5b9a\u304c\u5fc5\u8981\u306b\u306a\u308b\u306e\u3067\u3053\u306e\u3088\u3046\u306b\u5909\u63db\u3057\u3066\u3042\u3052\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u4e8b\u524d\u6e96\u5099 1.Scrapy\u306e\u30d7\u30ed\u30b8\u30a7\u30af\u30c8\u3092\u4f5c\u6210\u3059\u308b\u3002 scrapy startproject \u5bfe\u8c61\u30b5\u30a4\u30c8\u540d cd \u5bfe\u8c61\u30b5\u30a4\u30c8\u540d scrapy genspider \u4efb\u610f\u306espider\u540d www.xxx.com(\u30b5\u30a4\u30c8URL [&hellip;]","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[47],"tags":[],"_links":{"self":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/21211"}],"collection":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=21211"}],"version-history":[{"count":21,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/21211\/revisions"}],"predecessor-version":[{"id":21975,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=\/wp\/v2\/posts\/21211\/revisions\/21975"}],"wp:attachment":
[{"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=21211"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=21211"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.code-magagine.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=21211"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}