Method 1: use a middleware
Method 2: set the meta parameter of the Request class directly
The middleware approach takes two steps:
1. Activate the proxy middleware ProxyMiddleware in settings.py
2. Implement the ProxyMiddleware class in middlewares.py
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.ProxyMiddleware': 100,  # replace project_name with your own project name
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
Note:
The numbers 100 and 110 set the order in which the middlewares are called: the smaller the number, the earlier the middleware runs.
From the official documentation:
The integer values you assign to classes in this setting determine the order in which they run: items go through from lower valued to higher valued classes. It’s customary to define these numbers in the 0-1000 range.
Rotating the proxy constantly
Here the proxy is fetched with a simple GET from an online API. (An APIKEY is required; registering a free account gives you one. The APIKEY below is my own and is not guaranteed to stay valid!)
You can also scrape fresh proxies from the web on the fly.
Or read them from a local file (a sketch follows the middleware code below).
# middlewares.py
import requests

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        APIKEY = 'f95f08afc952c034cc2ff9c5548d51be'
        url = 'https://www.proxicity.io/api/v1/{}/proxy'.format(APIKEY)  # online API endpoint
        r = requests.get(url)
        request.meta['proxy'] = r.json()['curl']  # scheme://IP:port (e.g. http://5.39.85.100:30059)
        return None  # returning None lets the request continue through the remaining middlewares
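For the local-file option mentioned above, a minimal sketch could look like the following (the file name proxies.txt, holding one proxy URL per line, is an assumption):

# middlewares.py -- a sketch for the local-file option; proxies.txt is an assumed file name
import random

class FileProxyMiddleware(object):
    def __init__(self):
        # load the proxy list (one URL per line, e.g. http://5.39.85.100:30059) once at start-up
        with open('proxies.txt') as f:
            self.proxies = [line.strip() for line in f if line.strip()]

    def process_request(self, request, spider):
        # attach a randomly chosen proxy to every outgoing request
        request.meta['proxy'] = random.choice(self.proxies)

Register it in DOWNLOADER_MIDDLEWARES the same way as ProxyMiddleware above.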
For the second approach, attach the proxy through the meta argument of each Request inside the spider itself:

import random
import scrapy

# a proxy pool prepared in advance
proxy_pool = ['http://proxy_ip1:port', 'http://proxy_ip2:port', ..., 'http://proxy_ipn:port']

class MySpider(scrapy.Spider):
    name = "my_spider"
    allowed_domains = ["example.com"]
    start_urls = [
        'http://www.example.com/articals/',
    ]

    def start_requests(self):
        for url in self.start_urls:
            proxy_addr = random.choice(proxy_pool)  # pick a proxy at random
            yield scrapy.Request(url, callback=self.parse, meta={'proxy': proxy_addr})  # attach the proxy via meta

    def parse(self, response):
        # parse logic here
        pass
1. Reading the official documentation for the Request class, we can see that besides proxy you can also set method, headers, cookies, encoding and so on (a small sketch follows the signature below):
class scrapy.http.Request(url[, callback, method='GET', headers, body, cookies, meta, encoding='utf-8', priority=0, dont_filter=False, errback])
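For instance, a minimal sketch of a POST request that also sets headers, cookies and encoding (the URL, field names and parse_login callback are placeholders, not part of any real site):

import scrapy

class LoginSpider(scrapy.Spider):
    name = "login_example"

    def start_requests(self):
        yield scrapy.Request(
            'http://www.example.com/login',
            method='POST',                       # override the default GET
            body='user=foo&pass=bar',
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            cookies={'sessionid': 'xxxx'},
            encoding='utf-8',
            dont_filter=True,
            callback=self.parse_login,
        )

    def parse_login(self, response):
        self.logger.info("login page returned %s", response.status)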
2. The official documentation lists the special keys that Request.meta understands (a combined example follows the list):
dont_redirect
dont_retry
handle_httpstatus_list
handle_httpstatus_all
dont_merge_cookies (see cookies parameter of Request constructor)
cookiejar
dont_cache
redirect_urls
bindaddress
dont_obey_robotstxt
download_timeout
download_maxsize
proxy
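A sketch that combines several of these meta keys in a single request (the URL and all values are placeholders for illustration):

import scrapy

class MetaDemoSpider(scrapy.Spider):
    name = "meta_demo"

    def start_requests(self):
        yield scrapy.Request(
            'http://www.example.com/',
            callback=self.parse,
            meta={
                'proxy': 'http://some_proxy_server:port',  # route the request through a proxy
                'dont_redirect': True,                     # keep 3xx responses instead of following them
                'handle_httpstatus_list': [302, 403],      # let the callback see these status codes
                'download_timeout': 10,                    # per-request timeout in seconds
                'cookiejar': 1,                            # keep a separate cookie session
            },
        )

    def parse(self, response):
        self.logger.info("got status %s from %s", response.status, response.url)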
For example, to randomize both the request headers and the proxy:
# my_spider.py
import random
import scrapy

# a proxy pool collected in advance
proxy_pool = [
    'http://proxy_ip1:port',
    'http://proxy_ip2:port',
    ...,
    'http://proxy_ipn:port',
]

# a headers pool collected in advance
headers_pool = [
    {'User-Agent': 'Mozilla 1.0'},
    {'User-Agent': 'Mozilla 2.0'},
    {'User-Agent': 'Mozilla 3.0'},
    {'User-Agent': 'Mozilla 4.0'},
    {'User-Agent': 'Chrome 1.0'},
    {'User-Agent': 'Chrome 2.0'},
    {'User-Agent': 'Chrome 3.0'},
    {'User-Agent': 'Chrome 4.0'},
    {'User-Agent': 'IE 1.0'},
    {'User-Agent': 'IE 2.0'},
    {'User-Agent': 'IE 3.0'},
    {'User-Agent': 'IE 4.0'},
]

class MySpider(scrapy.Spider):
    name = "my_spider"
    allowed_domains = ["example.com"]
    start_urls = [
        'http://www.example.com/articals/',
    ]

    def start_requests(self):
        for url in self.start_urls:
            headers = random.choice(headers_pool)   # pick a random headers dict
            proxy_addr = random.choice(proxy_pool)  # pick a random proxy
            yield scrapy.Request(url, callback=self.parse, headers=headers, meta={'proxy': proxy_addr})

    def parse(self, response):
        # parse logic here
        pass
Hooking an IP proxy pool into Scrapy (code): https://blog.csdn.net/xudailong_blog/article/details/80153387
import random
import logging

import redis
from twisted.internet import defer
from twisted.internet.error import (
    TimeoutError, ConnectionRefusedError, ConnectError,
    ConnectionLost, TCPTimedOutError, ConnectionDone,
)

logger = logging.getLogger(__name__)

class HttpProxymiddleware(object):
    # exceptions that indicate the proxy should be changed
    EXCEPTIONS_TO_CHANGE = (defer.TimeoutError, TimeoutError, ConnectionRefusedError,
                            ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)

    def __init__(self):
        # connect to Redis; decode_responses=True returns str instead of bytes
        self.rds = redis.from_url('redis://:your_password@localhost:6379/0', decode_responses=True)

    def process_request(self, request, spider):
        # fetch all keys and pick one key/value pair at random
        keys = self.rds.hkeys("xila_hash")
        key = random.choice(keys)
        # the value is stored as a string; eval converts it back to a dict
        proxy = eval(self.rds.hget("xila_hash", key))
        logger.warning("----------------- trying proxy " + str(proxy) + " ------------------------")
        # store the proxy ip and its key in meta
        request.meta["proxy"] = proxy["ip"]
        request.meta["accountText"] = key

    def process_response(self, request, response, spider):
        http_status = response.status
        # on 200, increment the proxy's "times" counter, write it back and pass the response on
        if http_status == 200:
            key = request.meta["accountText"]
            proxy = eval(self.rds.hget("xila_hash", key))
            proxy["times"] = proxy["times"] + 1
            self.rds.hset("xila_hash", key, str(proxy))  # store back as a string so eval can read it later
            return response
        # 403 may be caused by an unusable user-agent rather than the proxy, so just retry the request
        elif http_status == 403:
            logging.warning("######################### 403, retrying the request ############################")
            return request.replace(dont_filter=True)
        # any other status: tentatively treat the IP as unusable; delete it if times < 10, otherwise keep it for now
        else:
            ip = request.meta["proxy"]
            key = request.meta["accountText"]
            proxy = eval(self.rds.hget("xila_hash", key))
            if proxy["times"] < 10:
                self.rds.hdel("xila_hash", key)
                logging.warning("################# " + ip + " unusable, deleted ########################")
            return request.replace(dont_filter=True)

    def process_exception(self, request, exception, spider):
        # timeouts and similar exceptions: the IP is unusable, delete it
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            key = request.meta["accountText"]
            print("+++++++++++++++++++++++++ {} unusable, will be deleted ++++++++++++++++++++++++".format(key))
            proxy = eval(self.rds.hget("xila_hash", key))
            if proxy["times"] < 10:
                self.rds.hdel("xila_hash", key)
            logger.debug("Proxy {} connection error: {}.".format(request.meta['proxy'], exception))
            return request.replace(dont_filter=True)
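For this middleware to take effect it also has to be enabled in settings.py; a minimal sketch (the module path project_name.middlewares and the priority value are assumptions):

# settings.py -- 'project_name.middlewares' is an assumed module path
DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.HttpProxymiddleware': 543,
}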
In a real crawl, sending requests too frequently usually gets you temporarily redirected to a login page (302), or even blocked outright (403), so for these responses we can retry the request once through a proxy:
(1) Following the built-in redirect.py module, pass the response straight through when conditions such as dont_redirect or handle_httpstatus_list are met
(2) Otherwise, if the response status is 302 or 403, re-issue the request through a proxy
(3) If the status is still 302 or 403 after using the proxy, drop the request
Save the following as /site-packages/my_middlewares.py
from w3lib.url import safe_url_string
from six.moves.urllib.parse import urljoin
from scrapy.exceptions import IgnoreRequest

class MyAutoProxyDownloaderMiddleware(object):
    def __init__(self, settings):
        self.proxy_status = settings.get('PROXY_STATUS', [302, 403])
        # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html?highlight=proxy#module-scrapy.downloadermiddlewares.httpproxy
        self.proxy_config = settings.get('PROXY_CONFIG', 'http://username:password@some_proxy_server:port')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(settings=crawler.settings)

    # See /site-packages/scrapy/downloadermiddlewares/redirect.py
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        if response.status in self.proxy_status:
            if 'Location' in response.headers:
                location = safe_url_string(response.headers['location'])
                redirected_url = urljoin(request.url, location)
            else:
                redirected_url = ''

            # AutoProxy for the first time
            if not request.meta.get('auto_proxy'):
                request.meta.update({'auto_proxy': True, 'proxy': self.proxy_config})
                new_request = request.replace(meta=request.meta, dont_filter=True)
                new_request.priority = request.priority + 2
                spider.log('Will AutoProxy for <{} {}> {}'.format(
                    response.status, request.url, redirected_url))
                return new_request
            # IgnoreRequest for the second time
            else:
                spider.logger.warn('Ignoring response <{} {}>: HTTP status code still in {} after AutoProxy'.format(
                    response.status, request.url, self.proxy_status))
                raise IgnoreRequest

        return response
(1) Add the following to the project's settings.py; note that the middleware must sit between the default RedirectMiddleware and HttpProxyMiddleware.
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'my_middlewares.MyAutoProxyDownloaderMiddleware': 601,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
}
PROXY_STATUS = [302, 403]
PROXY_CONFIG = 'http://username:password@some_proxy_server:port'
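(2) Run a test crawl. A minimal test spider that would produce the log below might look like this (a hedged sketch; only the spider name 'test' and the httpbin.org URLs are taken from the log):

# test_spider.py -- a sketch reconstructed from the log below
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = [
        'http://httpbin.org/',            # returns 200 and is crawled normally
        'http://httpbin.org/status/302',  # triggers AutoProxy, then IgnoreRequest
        'https://httpbin.org/status/403',
    ]

    def parse(self, response):
        self.log('Crawled {} {}'.format(response.status, response.url))

Scrapy crawl log: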
2018-07-18 18:42:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://httpbin.org/> (referer: None)
2018-07-18 18:42:38 [test] DEBUG: Will AutoProxy for <302 http://httpbin.org/status/302> http://httpbin.org/redirect/1
2018-07-18 18:42:43 [test] DEBUG: Will AutoProxy for <403 https://httpbin.org/status/403>
2018-07-18 18:42:51 [test] WARNING: Ignoring response <302 http://httpbin.org/status/302>: HTTP status code still in [302, 403] after AutoProxy
2018-07-18 18:42:52 [test] WARNING: Ignoring response <403 https://httpbin.org/status/403>: HTTP status code still in [302, 403] after AutoProxy
Proxy server log:
squid [18/Jul/2018:18:42:53 +0800] "GET http://httpbin.org/status/302 HTTP/1.1" 302 310 "-" "Mozilla/5.0" TCP_MISS:HIER_DIRECT
squid [18/Jul/2018:18:42:54 +0800] "CONNECT httpbin.org:443 HTTP/1.1" 200 3560 "-" "-" TCP_TUNNEL:HIER_DIRECT
When learning Python web scraping you will often find that the target site uses anti-scraping measures. High-intensity, high-frequency crawling puts a heavy load on the server, so repeatedly fetching the same pages from one IP is very likely to get that IP banned. A handy trick here is to use proxy IPs.
Install the requests library
Install the bs4 library
Install the lxml library
The IP addresses are taken from the domestic high-anonymity proxy site http://www.xicidaili.com/nn/
Scraping just the first page of IPs is enough for general use
from bs4 import BeautifulSoup
import requests
import random

def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)
    print(proxies)
The function get_ip_list(url, headers) takes a url and headers and returns a list of IPs; each element looks like 42.84.226.65:8888. The list covers every IP address and port on the first page of the high-anonymity proxy site.
The function get_random_ip(ip_list) takes the list returned by the first function and returns a random proxies dict, which can be passed into requests' get method. Each run then uses a different IP to access the target site, which effectively avoids the risk of getting your real IP banned. proxies is a dict of the form {'http': 'http://42.84.226.65:8888'}.
Running the code above yields a random proxies dict; simply pass it into requests' get method.
web_data = requests.get(url, headers=headers, proxies=proxies)
Sticking with the usual routine, over the last couple of days I scraped some data from Zhubajie at http://task.zbj.com/t-ppsj/p1s5.html. Probably because I pulled a bit too much data, my IP got banned and I had to pass a manual verification to unblock it, which obviously stopped me from scraping any more.
Below is my scraping code for Zhubajie, the one whose IP got banned
# coding=utf-8
import requests
from lxml import etree

def getUrl():
    for i in range(33):
        url = 'http://task.zbj.com/t-ppsj/p{}s5.html'.format(i+1)
        spiderPage(url)

def spiderPage(url):
    if url is None:
        return None
    htmlText = requests.get(url).text
    selector = etree.HTML(htmlText)
    tds = selector.xpath('//*[@class="tab-switch tab-progress"]/table/tr')
    try:
        for td in tds:
            price = td.xpath('./td/p/em/text()')
            href = td.xpath('./td/p/a/@href')
            title = td.xpath('./td/p/a/text()')
            subTitle = td.xpath('./td/p/text()')
            deadline = td.xpath('./td/span/text()')
            price = price[0] if len(price) > 0 else ''  # Python conditional expression: value_if_true if condition else value_if_false
            title = title[0] if len(title) > 0 else ''
            href = href[0] if len(href) > 0 else ''
            subTitle = subTitle[0] if len(subTitle) > 0 else ''
            deadline = deadline[0] if len(deadline) > 0 else ''
            print price, title, href, subTitle, deadline
            print '---------------------------------------------------------------------------------------'
            spiderDetail(href)
    except:
        print 'error'

def spiderDetail(url):
    if url is None:
        return None
    try:
        htmlText = requests.get(url).text
        selector = etree.HTML(htmlText)
        aboutHref = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/div/p[1]/a/@href')
        price = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/div/p[1]/text()')
        title = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/h2/text()')
        contentDetail = selector.xpath('//*[@id="utopia_widget_10"]/div[2]/div/div[1]/div[1]/text()')
        publishDate = selector.xpath('//*[@id="utopia_widget_10"]/div[2]/div/div[1]/p/text()')
        aboutHref = aboutHref[0] if len(aboutHref) > 0 else ''
        price = price[0] if len(price) > 0 else ''
        title = title[0] if len(title) > 0 else ''
        contentDetail = contentDetail[0] if len(contentDetail) > 0 else ''
        publishDate = publishDate[0] if len(publishDate) > 0 else ''
        print aboutHref, price, title, contentDetail, publishDate
    except:
        print 'error'

if __name__ == '__main__':
    getUrl()
After the code finished I noticed that the last few pages had not been scraped, and I could no longer reach the Zhubajie site at all; only after some time could I visit it again. That was rather awkward, so I needed a way to keep my IP from being banned.
How do you avoid getting your IP banned while scraping? Here are a few tricks I looked up.
1. Modify the request headers
My earlier scraper sent no headers; here I add headers so the requests look like they come from a browser
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400'
headers = {'User-Agent': user_agent}
htmlText = requests.get(url, headers=headers, proxies=proxies).text
2. Use proxy IPs
Once your own IP has been banned by a site, the only way to keep scraping is through proxy IPs, so try to use a proxy for every request; if a proxy gets banned, there are always more proxies.
Here I borrowed a snippet from this blog post to generate proxy IPs: http://blog.csdn.net/lammonpeter/article/details/52917264
It generates proxy IPs, and you can use the code as-is
# coding=utf-8
# IP addresses are taken from the domestic high-anonymity proxy site: http://www.xicidaili.com/nn/
# Scraping just the first page of IPs is enough for general use
from bs4 import BeautifulSoup
import requests
import random

def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)
    print(proxies)
With the code above I generated a batch of proxy IPs (some of them may be invalid, but as long as my own IP does not get banned, that is fine). I can then attach a proxy IP to my requests.
Adding a proxy IP to our request
proxies = {
    'http': 'http://124.72.109.183:8118',
    'https': 'http://49.85.1.79:31666',  # the dict maps URL scheme to proxy, so each scheme gets its own entry
}
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400'
headers = {'User-Agent': user_agent}
htmlText = requests.get(url, headers=headers, timeout=3, proxies=proxies).text
That is about all I know of so far.
The complete final code:
# coding=utf-8
import requests
import time
from lxml import etree

def getUrl():
    for i in range(33):
        url = 'http://task.zbj.com/t-ppsj/p{}s5.html'.format(i+1)
        spiderPage(url)

def spiderPage(url):
    if url is None:
        return None
    try:
        proxies = {
            'http': 'http://221.202.248.52:80',
        }
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400'
        headers = {'User-Agent': user_agent}
        htmlText = requests.get(url, headers=headers, proxies=proxies).text
        selector = etree.HTML(htmlText)
        tds = selector.xpath('//*[@class="tab-switch tab-progress"]/table/tr')
        for td in tds:
            price = td.xpath('./td/p/em/text()')
            href = td.xpath('./td/p/a/@href')
            title = td.xpath('./td/p/a/text()')
            subTitle = td.xpath('./td/p/text()')
            deadline = td.xpath('./td/span/text()')
            price = price[0] if len(price) > 0 else ''  # Python conditional expression: value_if_true if condition else value_if_false
            title = title[0] if len(title) > 0 else ''
            href = href[0] if len(href) > 0 else ''
            subTitle = subTitle[0] if len(subTitle) > 0 else ''
            deadline = deadline[0] if len(deadline) > 0 else ''
            print price, title, href, subTitle, deadline
            print '---------------------------------------------------------------------------------------'
            spiderDetail(href)
    except Exception, e:
        print 'error', e.message

def spiderDetail(url):
    if url is None:
        return None
    try:
        htmlText = requests.get(url).text
        selector = etree.HTML(htmlText)
        aboutHref = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/div/p[1]/a/@href')
        price = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/div/p[1]/text()')
        title = selector.xpath('//*[@id="utopia_widget_10"]/div[1]/div/div/h2/text()')
        contentDetail = selector.xpath('//*[@id="utopia_widget_10"]/div[2]/div/div[1]/div[1]/text()')
        publishDate = selector.xpath('//*[@id="utopia_widget_10"]/div[2]/div/div[1]/p/text()')
        aboutHref = aboutHref[0] if len(aboutHref) > 0 else ''
        price = price[0] if len(price) > 0 else ''
        title = title[0] if len(title) > 0 else ''
        contentDetail = contentDetail[0] if len(contentDetail) > 0 else ''
        publishDate = publishDate[0] if len(publishDate) > 0 else ''
        print aboutHref, price, title, contentDetail, publishDate
    except:
        print 'error'

if __name__ == '__main__':
    getUrl()
In the end the program ran perfectly and my IP was never banned again. Of course these are not the only ways to avoid IP bans; that still needs further exploration!