You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

213 lines
9.1 KiB

from requests.exceptions import RequestException
from requests.models import Response
from requests.sessions import Session
import logging
import random
import re
import time
from _23 import b64encodestring, urlparse
DEFAULT_USER_AGENTS = [
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/50.0.2661.102 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/52.0.2743.116 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0)'
' Gecko/20100101 Firefox/46.0',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0)'
' Gecko/20100101 Firefox/41.0'
]
class CloudflareError(RequestException):
pass
class CloudflareScraper(Session):
def __init__(self, **kwargs):
super(CloudflareScraper, self).__init__()
if 'requests' in self.headers['User-Agent']:
# Set a random User-Agent if no custom User-Agent has been set
self.headers['User-Agent'] = random.choice(DEFAULT_USER_AGENTS)
self.cf_ua = self.headers['User-Agent']
self.default_delay = 8
self.delay = kwargs.pop('delay', self.default_delay)
self.start_time = None
self.trust_env = False
def request(self, method, url, *args, **kwargs):
url_solver = kwargs.pop('url_solver', None)
if not kwargs.pop('proxy_browser', None):
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
else:
resp = self.get_content(method, url, url_solver,
user_agent=self.headers.get('User-Agent'), proxy_browser=True, **kwargs)
if (isinstance(resp, type(Response()))
and resp.status_code in (503, 429, 403)):
self.start_time = time.time()
if (re.search('(?i)cloudflare', resp.headers.get('Server', ''))
and b'jschl_vc' in resp.content
and b'jschl_answer' in resp.content):
resp = self.solve_cf_challenge(resp, url_solver, **kwargs)
elif b'ddgu' in resp.content:
resp = self.solve_ddg_challenge(resp, **kwargs)
return resp
def wait(self):
delay = self.delay - (time.time() - self.start_time)
time.sleep((0, delay)[0 < delay]) # required delay before solving the challenge
def solve_ddg_challenge(self, resp, **original_kwargs):
parsed_url = urlparse(resp.url)
try:
submit_url = parsed_url.scheme + ':' + re.findall('"frm"[^>]+?action="([^"]+)"', resp.text)[0]
kwargs = {k: v for k, v in original_kwargs.items() if k not in ['hooks']}
kwargs.setdefault('headers', {})
kwargs.setdefault('data', dict(
h=b64encodestring('%s://%s' % (parsed_url.scheme, parsed_url.hostname)),
u=b64encodestring(parsed_url.path), p=b64encodestring(parsed_url.port or '')
))
self.wait()
resp = self.request('POST', submit_url, **kwargs)
except (BaseException, Exception):
pass
return resp
def test_flaresolverr(self, url_solver):
# test if FlareSolverr software is running
response_test = super(CloudflareScraper, self).request('GET', url_solver)
fs_ver = None
if 200 == response_test.status_code and response_test.ok:
json_data = response_test.json()
if 'ok' == json_data.get('status'):
fs_ver = json_data.get('version')
if None is fs_ver:
raise ValueError('FlareSolverr software not found (is it running?)')
return fs_ver
def get_content(self, method, url, url_solver, user_agent, proxy_browser=False, **kwargs):
url_solver = url_solver and re.sub(r'(/|v1)*$', '', url_solver) or 'http://localhost:8191'
if not self.test_flaresolverr(url_solver):
raise ValueError('No FlareSolverr software running %sat %s' % (('to solve Cloudflare challenge ',
'')[proxy_browser], url_solver))
try:
response = super(CloudflareScraper, self).request('POST', '%s/v1' % url_solver, json=dict(
cmd='request.%s' % method.lower(), userAgent=user_agent, url=url,
cookies=[{'name': cur_ckee.name, 'value': cur_ckee.value,
'domain': cur_ckee.domain, 'path': cur_ckee.path} for cur_ckee in self.cookies]))
except(BaseException, Exception) as e:
raise ValueError('FlareSolverr software unable to %s: %r' % (('solve Cloudflare anti-bot IUAM challenge',
'fetch content')[proxy_browser], e))
if None is not response:
data_json = response.json()
result = ({}, data_json)[isinstance(data_json, (dict, list))]
if response.ok:
if 'ok' == result.get('status'):
self.cookies.clear()
for cur_ckee in result.get('solution', {}).get('cookies', []):
if cur_ckee.get('value') and cur_ckee.get('name') not in ('', None, '_gid', '_ga', '_gat'):
self.cookies.set(
cur_ckee['name'], cur_ckee['value'],
rest={'httpOnly': cur_ckee.get('httpOnly'), 'session': cur_ckee.get('session')},
**dict([(k, cur_ckee.get(k)) for k in ('expires', 'domain', 'path', 'secure')]))
else:
response = None
elif 'error' == result.get('status'):
raise ValueError('Failure with FlareSolverr: %s' % result.get('message', 'See the FlareSolver output'))
return response
def solve_cf_challenge(self, resp, url_solver, **original_kwargs):
body = resp.text
parsed_url = urlparse(resp.url)
domain = parsed_url.netloc
if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
raise CloudflareError(
'Cloudflare captcha presented for %s, safe to ignore as this shouldn\'t happen every time, ua: %s' %
(domain, self.cf_ua), response=resp)
if None is self.get_content(
'GET', (resp.request.url, '%s://%s/' % (parsed_url.scheme, domain))['POST' == resp.request.method],
url_solver, user_agent=resp.request.headers.get('User-Agent')):
raise ValueError('Failed to validate Cloudflare anti-bot IUAM challenge')
time.sleep(1.2)
final_response = super(CloudflareScraper, self).request(resp.request.method, resp.request.url, headers={
'User-Agent': resp.request.headers.get('User-Agent')}, **original_kwargs)
# if final_response and 200 == getattr(final_response, 'status_code'):
# return final_response
return final_response
@classmethod
def create_scraper(cls, sess=None, **kwargs):
"""
Convenience function for creating a ready-to-go CloudflareScraper object.
"""
scraper = cls(**kwargs)
if sess:
attrs = ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']
for attr in attrs:
val = getattr(sess, attr, None)
if val:
setattr(scraper, attr, val)
return scraper
# Functions for integrating cloudflare-scrape with other applications and scripts
@classmethod
def get_tokens(cls, url, user_agent=None, **kwargs):
scraper = cls.create_scraper()
if user_agent:
scraper.headers['User-Agent'] = user_agent
try:
resp = scraper.get(url, **kwargs)
resp.raise_for_status()
except (BaseException, Exception):
logging.error('[%s] returned an error. Could not collect tokens.' % url)
raise
domain = urlparse(resp.url).netloc
for d in scraper.cookies.list_domains():
if d.startswith('.') and d in ('.' + domain):
cookie_domain = d
break
else:
raise ValueError('Unable to find Cloudflare cookies.'
' Does the site actually have Cloudflare IUAM (\'I\'m Under Attack Mode\') enabled?')
return (
{'__cfduid': scraper.cookies.get('__cfduid', '', domain=cookie_domain),
'cf_clearance': scraper.cookies.get('cf_clearance', '', domain=cookie_domain)},
scraper.headers['User-Agent'])
@classmethod
def get_cookie_string(cls, url, user_agent=None, **kwargs):
"""
Convenience function for building a Cookie HTTP header value.
"""
tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
return '; '.join(['='.join(pair) for pair in tokens.items()]), user_agent
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string