diff --git a/crawlers/lib/crawl.py b/crawlers/lib/crawl.py index 8d3ab9f..d2e5f9e 100644 --- a/crawlers/lib/crawl.py +++ b/crawlers/lib/crawl.py @@ -6,11 +6,14 @@ import uuid from typing import List, Generator from flask import current_app +from requests import ConnectionError, Timeout, TooManyRedirects from crawlers.constants import BLOCK_KEY_CALLBACK_URL from crawlers.lib.platforms.i_crawler import ICrawler from crawlers.lib.platforms import platforms +from urllib3.exceptions import MaxRetryError +from requests.exceptions import RequestException logger = logging.getLogger(__name__) @@ -54,7 +57,11 @@ def process_block_url(session, block_url) -> None: f"skip crawl - no callback_url found! - key: {BLOCK_KEY_CALLBACK_URL}, block_data: {block_data}" ) else: - repos = run_block(block_data) + try: + repos = run_block(block_data) + except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects): + logger.exception("hosting service not reachable - no indexer callback issued") + return _hoster_session_request( "PUT", session, url=block_data[BLOCK_KEY_CALLBACK_URL], json=repos ) @@ -76,8 +83,9 @@ def crawl(platform: ICrawler) -> Generator[List[dict], None, None]: else: # right now we dont want to emit failures (via yield) because that will send empty results back # to the indexer, which can trigger a state reset (i.e. reached end, start over). - # TODO deal with failures - what are they? + # - complete connection failures and such should be handled via raised exceptions within crawlers! pass + logger.debug(f"END block: {platform.type} - final state: {platform.state}") diff --git a/crawlers/lib/platforms/__init__.py b/crawlers/lib/platforms/__init__.py index 9910dac..624d719 100644 --- a/crawlers/lib/platforms/__init__.py +++ b/crawlers/lib/platforms/__init__.py @@ -1,15 +1,14 @@ -from typing import Dict, Any, Type, Union +from typing import Dict from crawlers.lib.platforms.i_crawler import ICrawler from crawlers.lib.platforms.gitea import GiteaCrawler from crawlers.lib.platforms.gitlab import GitLabCrawler from crawlers.lib.platforms.bitbucket import BitBucketCrawler -from crawlers.lib.platforms.github import GitHubV4Crawler, GitHubRESTCrawler +from crawlers.lib.platforms.github import GitHubV4Crawler platforms: Dict[str, ICrawler] = { GiteaCrawler.type: GiteaCrawler, GitLabCrawler.type: GitLabCrawler, GitHubV4Crawler.type: GitHubV4Crawler, - GitHubRESTCrawler.type: GitHubRESTCrawler, BitBucketCrawler.type: BitBucketCrawler, } diff --git a/crawlers/lib/platforms/bitbucket.py b/crawlers/lib/platforms/bitbucket.py index ecd4870..e60a7f7 100644 --- a/crawlers/lib/platforms/bitbucket.py +++ b/crawlers/lib/platforms/bitbucket.py @@ -1,7 +1,7 @@ import logging import time import requests -from typing import List, Tuple +from typing import List, Tuple, Union from urllib.parse import urljoin from crawlers.lib.platforms.i_crawler import ICrawler @@ -65,7 +65,7 @@ def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: logger.error(e) logger.error(e.response.reason) logger.error(e.response.text) - return False, [], {} + return False, [], {}, e response_json = response.json() repos = response_json['values'] diff --git a/crawlers/lib/platforms/gitea.py b/crawlers/lib/platforms/gitea.py index 144d228..73565cd 100644 --- a/crawlers/lib/platforms/gitea.py +++ b/crawlers/lib/platforms/gitea.py @@ -1,5 +1,8 @@ import logging -from typing import List, Tuple +from typing import List, Tuple, Union + +from requests import ConnectionError, Timeout, TooManyRedirects +from urllib3.exceptions import MaxRetryError from crawlers.constants import GITEA_PER_PAGE_MAX, DEFAULT_REQUEST_TIMEOUT from crawlers.lib.platforms.i_crawler import ICrawler @@ -40,6 +43,10 @@ def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: f"- response not ok, status: {response.status_code}") return False, [], state # nr.1 - we skip rest of this block, hope we get it next time result = response.json() + except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects) as e: + logger.exception(f"{self} - crawler cannot reach hoster") + # we re-raise these, as we want to avoid returning empty results to the indexer + raise e except Exception as e: logger.exception(f"(skipping block chunk) gitea crawler crashed") return False, [], state # nr.2 - we skip rest of this block, hope we get it next time diff --git a/crawlers/lib/platforms/github/__init__.py b/crawlers/lib/platforms/github/__init__.py index 0182d72..d2b82b8 100644 --- a/crawlers/lib/platforms/github/__init__.py +++ b/crawlers/lib/platforms/github/__init__.py @@ -1,2 +1 @@ from .github_v4 import GitHubV4Crawler -from .github_rest import GitHubRESTCrawler diff --git a/crawlers/lib/platforms/github/github_rest.py b/crawlers/lib/platforms/github/github_rest.py deleted file mode 100644 index 7745c5a..0000000 --- a/crawlers/lib/platforms/github/github_rest.py +++ /dev/null @@ -1,249 +0,0 @@ -""" -Crawl through GitHub via their REST API. -Gets repositories connected to users. -""" -import logging -import time -from typing import List, Tuple -from urllib.parse import urljoin - -from crawlers.lib.platforms.i_crawler import ICrawler -from crawlers.constants import DEFAULT_REQUEST_TIMEOUT - -logger = logging.getLogger(__name__) - - -class GitHubRESTCrawler(ICrawler): - """ - Accept-Ranges: bytes - Content-Length: 32867 - X-GitHub-Request-Id: DE4C:7325:72FE05F:88CCA3F:5F74ECF6 - X-Ratelimit-Limit: 60 - X-Ratelimit-Remaining: 46 - X-Ratelimit-Reset: 1601501700 - X-Ratelimit-Used: 14 - access-control-allow-origin: * - access-control-expose-headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, Deprecation, Sunset - cache-control: public, max-age=60, s-maxage=60 - content-encoding: gzip - content-security-policy: default-src 'none' - content-type: application/json; charset=utf-8 - date: Wed, 30 Sep 2020 20:39:07 GMT - etag: W/"f95a90519ac2d5dbc76753515500268383c3b666f6fdaf187d82444e25ba14a5" - link: ; rel="next", ; rel="first" - referrer-policy: origin-when-cross-origin, strict-origin-when-cross-origin - server: GitHub.com - status: 200 OK - strict-transport-security: max-age=31536000; includeSubdomains; preload - vary: Accept, Accept-Encoding, Accept, X-Requested-With, Accept-Encoding - x-content-type-options: nosniff - x-frame-options: deny - x-github-media-type: github.v3; format=json - x-xss-protection: 1; mode=block - """ - - type: str = 'github_rest' - - def __init__(self, base_url, state=None, api_key=None, **kwargs): - super().__init__( - base_url=base_url, - path='', - state=state, - api_key=api_key, - **kwargs - ) - if api_key: - self.requests.auth = ( - api_key['client_id'], - api_key['client_secret']) - - def request(self, url, params=None): - response = False - while not response: - try: - response = self.requests.get(url, params=params, timeout=DEFAULT_REQUEST_TIMEOUT) - response.raise_for_status() - except Exception as e: - logger.error(e) - - # todo: test this - logger.warning( - f'{self} sleeping for 10min...') - time.sleep(60 * 10) - response = False - return response - - def handle_ratelimit(self, response): - h = response.headers - ratelimit_remaining = int(h.get('X-Ratelimit-Remaining')) - ratelimit_reset_timestamp = int(h.get('X-Ratelimit-Reset')) - reset_in = ratelimit_reset_timestamp - time.time() - - logger.info( - f'{self} {ratelimit_remaining} requests remaining, reset in {reset_in}s') - if ratelimit_remaining < 1: - logger.warning( - f'{self} rate limiting: {ratelimit_remaining} requests remaining, sleeping {reset_in}s') - time.sleep(reset_in) - - def get_user_repos(self, user_repos_url): - while user_repos_url: - response = self.request(user_repos_url, params=dict(per_page=100)) - results = response.json() - - yield results - - self.handle_ratelimit(response) - header_next = response.links.get('next', {}) - user_repos_url = header_next.get('url', False) - - def crawl(self, state=None) -> Tuple[bool, List[dict], dict]: - """ :return: success, repos, state """ - user_url = False - if state: - user_url = state.get('user_url', False) - if not user_url: - logger.warning('{self} broken state, defaulting to start') - - if not user_url: - user_url = '/users' - - while user_url: - user_response = self.request(urljoin(self.base_url, user_url)) - self.handle_ratelimit(user_response) - - users_page = user_response.json() - for user in users_page: - user_repos = [] - for repo_page in self.get_user_repos(user['repos_url']): - logger.debug(f'{self} {len(repo_page)} repos in page') - user_repos += repo_page - state = {'user_url': user_url} - yield True, user_repos, state - - # https://stackoverflow.com/questions/32312758/python-requests-link-headers - user_header_next = user_response.links.get('next', {}) - user_url = user_header_next.get('url', False) - if not user_url: - # not hit rate limit, and we dont have a next url - finished! - # reset state - yield True, [], None - time.sleep(.01) - - """ expected GitHub result - { - "id": 1296269, - "node_id": "MDEwOlJlcG9zaXRvcnkxMjk2MjY5", - "name": "Hello-World", - "full_name": "octocat/Hello-World", - "owner": { - "login": "octocat", - "id": 1, - "node_id": "MDQ6VXNlcjE=", - "avatar_url": "https://github.com/images/error/octocat_happy.gif", - "gravatar_id": "", - "url": "https://api.github.com/users/octocat", - "html_url": "https://github.com/octocat", - "followers_url": "https://api.github.com/users/octocat/followers", - "following_url": "https://api.github.com/users/octocat/following{/other_user}", - "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}", - "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/octocat/subscriptions", - "organizations_url": "https://api.github.com/users/octocat/orgs", - "repos_url": "https://api.github.com/users/octocat/repos", - "events_url": "https://api.github.com/users/octocat/events{/privacy}", - "received_events_url": "https://api.github.com/users/octocat/received_events", - "type": "User", - "site_admin": false - }, - "private": false, - "html_url": "https://github.com/octocat/Hello-World", - "description": "This your first repo!", - "fork": false, - "url": "https://api.github.com/repos/octocat/Hello-World", - "archive_url": "https://api.github.com/repos/octocat/Hello-World/{archive_format}{/ref}", - "assignees_url": "https://api.github.com/repos/octocat/Hello-World/assignees{/user}", - "blobs_url": "https://api.github.com/repos/octocat/Hello-World/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/octocat/Hello-World/branches{/branch}", - "collaborators_url": "https://api.github.com/repos/octocat/Hello-World/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/octocat/Hello-World/comments{/number}", - "commits_url": "https://api.github.com/repos/octocat/Hello-World/commits{/sha}", - "compare_url": "https://api.github.com/repos/octocat/Hello-World/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/octocat/Hello-World/contents/{+path}", - "contributors_url": "https://api.github.com/repos/octocat/Hello-World/contributors", - "deployments_url": "https://api.github.com/repos/octocat/Hello-World/deployments", - "downloads_url": "https://api.github.com/repos/octocat/Hello-World/downloads", - "events_url": "https://api.github.com/repos/octocat/Hello-World/events", - "forks_url": "https://api.github.com/repos/octocat/Hello-World/forks", - "git_commits_url": "https://api.github.com/repos/octocat/Hello-World/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/octocat/Hello-World/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/octocat/Hello-World/git/tags{/sha}", - "git_url": "git:github.com/octocat/Hello-World.git", - "issue_comment_url": "https://api.github.com/repos/octocat/Hello-World/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/octocat/Hello-World/issues/events{/number}", - "issues_url": "https://api.github.com/repos/octocat/Hello-World/issues{/number}", - "keys_url": "https://api.github.com/repos/octocat/Hello-World/keys{/key_id}", - "labels_url": "https://api.github.com/repos/octocat/Hello-World/labels{/name}", - "languages_url": "https://api.github.com/repos/octocat/Hello-World/languages", - "merges_url": "https://api.github.com/repos/octocat/Hello-World/merges", - "milestones_url": "https://api.github.com/repos/octocat/Hello-World/milestones{/number}", - "notifications_url": "https://api.github.com/repos/octocat/Hello-World/notifications{?since,all,participating}", - "pulls_url": "https://api.github.com/repos/octocat/Hello-World/pulls{/number}", - "releases_url": "https://api.github.com/repos/octocat/Hello-World/releases{/id}", - "ssh_url": "git@github.com:octocat/Hello-World.git", - "stargazers_url": "https://api.github.com/repos/octocat/Hello-World/stargazers", - "statuses_url": "https://api.github.com/repos/octocat/Hello-World/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/octocat/Hello-World/subscribers", - "subscription_url": "https://api.github.com/repos/octocat/Hello-World/subscription", - "tags_url": "https://api.github.com/repos/octocat/Hello-World/tags", - "teams_url": "https://api.github.com/repos/octocat/Hello-World/teams", - "trees_url": "https://api.github.com/repos/octocat/Hello-World/git/trees{/sha}", - "clone_url": "https://github.com/octocat/Hello-World.git", - "mirror_url": "git:git.example.com/octocat/Hello-World", - "hooks_url": "https://api.github.com/repos/octocat/Hello-World/hooks", - "svn_url": "https://svn.github.com/octocat/Hello-World", - "homepage": "https://github.com", - "language": null, - "forks_count": 9, - "stargazers_count": 80, - "watchers_count": 80, - "size": 108, - "default_branch": "master", - "open_issues_count": 0, - "is_template": true, - "topics": [ - "octocat", - "atom", - "electron", - "api" - ], - "has_issues": true, - "has_projects": true, - "has_wiki": true, - "has_pages": false, - "has_downloads": true, - "archived": false, - "disabled": false, - "visibility": "public", - "pushed_at": "2011-01-26T19:06:43Z", - "created_at": "2011-01-26T19:01:12Z", - "updated_at": "2011-01-26T19:14:43Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "template_repository": "octocat/template", - "temp_clone_token": "ABTLWHOULUVAXGTRYU7OC2876QJ2O", - "delete_branch_on_merge": true, - "subscribers_count": 42, - "network_count": 0, - "license": { - "key": "mit", - "name": "MIT License", - "spdx_id": "MIT", - "url": "https://api.github.com/licenses/mit", - "node_id": "MDc6TGljZW5zZW1pdA==" - } - } - """ diff --git a/crawlers/lib/platforms/github/github_rest_legacy.py b/crawlers/lib/platforms/github/github_rest_legacy.py deleted file mode 100644 index 73ef24f..0000000 --- a/crawlers/lib/platforms/github/github_rest_legacy.py +++ /dev/null @@ -1,246 +0,0 @@ -""" -Legacy note: - -This crawler is not finished and wont be used. -It needs to run extra requests per repository (+1 per data-point) -to get the real data, which we dont want to do. -""" -import logging -import time -import math -from typing import List, Tuple -from urllib.parse import urljoin - -from crawlers.constants import DEFAULT_REQUEST_TIMEOUT - -from crawlers.lib.platforms.i_crawler import ICrawler - -logger = logging.getLogger(__name__) - - -class GitHubRESTCrawler(ICrawler): - """ - Accept-Ranges: bytes - Content-Length: 32867 - X-GitHub-Request-Id: DE4C:7325:72FE05F:88CCA3F:5F74ECF6 - X-Ratelimit-Limit: 60 - X-Ratelimit-Remaining: 46 - X-Ratelimit-Reset: 1601501700 - X-Ratelimit-Used: 14 - access-control-allow-origin: * - access-control-expose-headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, Deprecation, Sunset - cache-control: public, max-age=60, s-maxage=60 - content-encoding: gzip - content-security-policy: default-src 'none' - content-type: application/json; charset=utf-8 - date: Wed, 30 Sep 2020 20:39:07 GMT - etag: W/"f95a90519ac2d5dbc76753515500268383c3b666f6fdaf187d82444e25ba14a5" - link: ; rel="next", ; rel="first" - referrer-policy: origin-when-cross-origin, strict-origin-when-cross-origin - server: GitHub.com - status: 200 OK - strict-transport-security: max-age=31536000; includeSubdomains; preload - vary: Accept, Accept-Encoding, Accept, X-Requested-With, Accept-Encoding - x-content-type-options: nosniff - x-frame-options: deny - x-github-media-type: github.v3; format=json - x-xss-protection: 1; mode=block - """ - - type: str = 'github_rest_legacy' - - def __init__(self, base_url, state=None, api_key=None, **kwargs): - super().__init__( - base_url=base_url, - path='', - state=state, - api_key=api_key, - **kwargs - ) - if api_key: - self.requests.headers.update( - {"Authorization": f"Bearer {api_key['access_token']}"}) - - def request(self, url, params=None): - response = False - while not response: - try: - response = self.requests.get(url, params=params, timeout=DEFAULT_REQUEST_TIMEOUT) - response.raise_for_status() - except Exception as e: - logger.error(e) - - # todo: test this - logger.warning( - f'{self} sleeping for 10min...') - time.sleep(60 * 10) - response = False - return response - - def handle_ratelimit(self, response): - h = response.headers - ratelimit_remaining = int(h.get('X-Ratelimit-Remaining')) - ratelimit_reset_timestamp = int(h.get('X-Ratelimit-Reset')) - reset_in = ratelimit_reset_timestamp - time.time() - - logger.info( - f'{self} {ratelimit_remaining} requests remaining, reset in {reset_in}s') - if ratelimit_remaining < 1: - logger.warning( - f'{self} rate limiting: {ratelimit_remaining} requests remaining, sleeping {reset_in}s') - time.sleep(reset_in) - - @staticmethod - def get_next_link(response) -> (str, int): - link = None - index = None - pagination = response.headers.get('link', '') - if 'next' in pagination: - # should contain the following: - # ; rel="next", ; rel="first" - link = pagination.split(">")[0][1:] - index = int(link.split("since=")[1].split("&")[0]) - return link, index - - def init_state(self, state: dict = None): - if not state: - state = {} - state['start_at'] = state.get('start_at', 0) - state['end_at'] = state.get('end_at', math.inf) - state['current'] = state.get('current', 0) - state['next_link'] = state.get('next_link', urljoin(self.base_url, f'/repositories?since={state["start_at"]}')) - return state - - def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: - """ :return: success, repos, state """ - state = self.init_state(state) - while state["next_link"]: - time.sleep(.01) # default self-throttling - repo_response = self.request(state["next_link"]) - self.handle_ratelimit(repo_response) # sleep when needed - - results = repo_response.json() - logger.debug(f'{self} {len(results)} repos in page') - state["next_link"], state["current"] = self.get_next_link(repo_response) - if len(results) == 0 or state["current"] >= state["end_at"]: - state["next_link"] = None # finished - - yield True, results, state - - """ expected GitHub response - { - "id": 1296269, - "node_id": "MDEwOlJlcG9zaXRvcnkxMjk2MjY5", - "name": "Hello-World", - "full_name": "octocat/Hello-World", - "owner": { - "login": "octocat", - "id": 1, - "node_id": "MDQ6VXNlcjE=", - "avatar_url": "https://github.com/images/error/octocat_happy.gif", - "gravatar_id": "", - "url": "https://api.github.com/users/octocat", - "html_url": "https://github.com/octocat", - "followers_url": "https://api.github.com/users/octocat/followers", - "following_url": "https://api.github.com/users/octocat/following{/other_user}", - "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}", - "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/octocat/subscriptions", - "organizations_url": "https://api.github.com/users/octocat/orgs", - "repos_url": "https://api.github.com/users/octocat/repos", - "events_url": "https://api.github.com/users/octocat/events{/privacy}", - "received_events_url": "https://api.github.com/users/octocat/received_events", - "type": "User", - "site_admin": false - }, - "private": false, - "html_url": "https://github.com/octocat/Hello-World", - "description": "This your first repo!", - "fork": false, - "url": "https://api.github.com/repos/octocat/Hello-World", - "archive_url": "https://api.github.com/repos/octocat/Hello-World/{archive_format}{/ref}", - "assignees_url": "https://api.github.com/repos/octocat/Hello-World/assignees{/user}", - "blobs_url": "https://api.github.com/repos/octocat/Hello-World/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/octocat/Hello-World/branches{/branch}", - "collaborators_url": "https://api.github.com/repos/octocat/Hello-World/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/octocat/Hello-World/comments{/number}", - "commits_url": "https://api.github.com/repos/octocat/Hello-World/commits{/sha}", - "compare_url": "https://api.github.com/repos/octocat/Hello-World/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/octocat/Hello-World/contents/{+path}", - "contributors_url": "https://api.github.com/repos/octocat/Hello-World/contributors", - "deployments_url": "https://api.github.com/repos/octocat/Hello-World/deployments", - "downloads_url": "https://api.github.com/repos/octocat/Hello-World/downloads", - "events_url": "https://api.github.com/repos/octocat/Hello-World/events", - "forks_url": "https://api.github.com/repos/octocat/Hello-World/forks", - "git_commits_url": "https://api.github.com/repos/octocat/Hello-World/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/octocat/Hello-World/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/octocat/Hello-World/git/tags{/sha}", - "git_url": "git:github.com/octocat/Hello-World.git", - "issue_comment_url": "https://api.github.com/repos/octocat/Hello-World/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/octocat/Hello-World/issues/events{/number}", - "issues_url": "https://api.github.com/repos/octocat/Hello-World/issues{/number}", - "keys_url": "https://api.github.com/repos/octocat/Hello-World/keys{/key_id}", - "labels_url": "https://api.github.com/repos/octocat/Hello-World/labels{/name}", - "languages_url": "https://api.github.com/repos/octocat/Hello-World/languages", - "merges_url": "https://api.github.com/repos/octocat/Hello-World/merges", - "milestones_url": "https://api.github.com/repos/octocat/Hello-World/milestones{/number}", - "notifications_url": "https://api.github.com/repos/octocat/Hello-World/notifications{?since,all,participating}", - "pulls_url": "https://api.github.com/repos/octocat/Hello-World/pulls{/number}", - "releases_url": "https://api.github.com/repos/octocat/Hello-World/releases{/id}", - "ssh_url": "git@github.com:octocat/Hello-World.git", - "stargazers_url": "https://api.github.com/repos/octocat/Hello-World/stargazers", - "statuses_url": "https://api.github.com/repos/octocat/Hello-World/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/octocat/Hello-World/subscribers", - "subscription_url": "https://api.github.com/repos/octocat/Hello-World/subscription", - "tags_url": "https://api.github.com/repos/octocat/Hello-World/tags", - "teams_url": "https://api.github.com/repos/octocat/Hello-World/teams", - "trees_url": "https://api.github.com/repos/octocat/Hello-World/git/trees{/sha}", - "clone_url": "https://github.com/octocat/Hello-World.git", - "mirror_url": "git:git.example.com/octocat/Hello-World", - "hooks_url": "https://api.github.com/repos/octocat/Hello-World/hooks", - "svn_url": "https://svn.github.com/octocat/Hello-World", - "homepage": "https://github.com", - "language": null, - "forks_count": 9, - "stargazers_count": 80, - "watchers_count": 80, - "size": 108, - "default_branch": "master", - "open_issues_count": 0, - "is_template": true, - "topics": [ - "octocat", - "atom", - "electron", - "api" - ], - "has_issues": true, - "has_projects": true, - "has_wiki": true, - "has_pages": false, - "has_downloads": true, - "archived": false, - "disabled": false, - "visibility": "public", - "pushed_at": "2011-01-26T19:06:43Z", - "created_at": "2011-01-26T19:01:12Z", - "updated_at": "2011-01-26T19:14:43Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "template_repository": "octocat/template", - "temp_clone_token": "ABTLWHOULUVAXGTRYU7OC2876QJ2O", - "delete_branch_on_merge": true, - "subscribers_count": 42, - "network_count": 0, - "license": { - "key": "mit", - "name": "MIT License", - "spdx_id": "MIT", - "url": "https://api.github.com/licenses/mit", - "node_id": "MDc6TGljZW5zZW1pdA==" - } - } - """ diff --git a/crawlers/lib/platforms/github/github_v4.py b/crawlers/lib/platforms/github/github_v4.py index 47a349e..34d9bda 100644 --- a/crawlers/lib/platforms/github/github_v4.py +++ b/crawlers/lib/platforms/github/github_v4.py @@ -7,9 +7,11 @@ import logging import time import base64 -from typing import List, Tuple +from typing import List, Tuple, Union from iso8601 import iso8601 from requests import Response +from requests.exceptions import Timeout, TooManyRedirects, ConnectionError +from urllib3.exceptions import MaxRetryError from crawlers.lib.platforms.i_crawler import ICrawler from crawlers.constants import ( @@ -195,10 +197,9 @@ def send_query() -> Response: failed_count = 0 while response.status_code == 403 and failed_count < GITHUB_ABUSE_RETRY_MAX: # we sometimes run in to some "hidden" abuse detection on multiple crawlers - # it tells use to wait a few minutes, but a few seconds is enough to be allowed again + # it may tell us to wait a few minutes, but a few seconds is enough to be allowed again # thus, we repeatedly try again to avoid having holes in our data (skipped block chunks) - # TODO don't see a way to avoid triggering this right now - # TODO it triggers even though we have plenty of ratelimit to spare + # however, it might be other reason and other severities - hence we limit the retries failed_count += 1 logger.warning(f"status 403 - retry block chunk inĀ {GITHUB_API_ABUSE_SLEEP}s" f"- probably triggered abuse flag? json:\n{response.json()}") @@ -235,6 +236,10 @@ def send_query() -> Response: yield False, [], state self.handle_ratelimit(response) + except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects) as e: + logger.exception(f"{self} - crawler cannot reach hoster") + # we re-raise these, as we want to avoid returning empty results to the indexer + raise e except Exception as e: logger.exception(f"(skipping block chunk) github crawler crashed") yield False, [], state diff --git a/crawlers/lib/platforms/github/github_v4_legacy.py b/crawlers/lib/platforms/github/github_v4_legacy.py deleted file mode 100644 index af29517..0000000 --- a/crawlers/lib/platforms/github/github_v4_legacy.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Legacy note: - -There is a limit at 1000 results for the search api (both rest and graphql) -https://stackoverflow.com/questions/48371313/github-api-pagination-limit -https://developer.github.com/v3/search/#about-the-search-api - -This version is not usable for us as we cannot get around this limit! -""" -import pathlib -import logging -import time -from typing import List, Tuple -from iso8601 import iso8601 -from urllib.parse import urljoin - -from crawlers.lib.platforms.i_crawler import ICrawler -from crawlers.constants import DEFAULT_REQUEST_TIMEOUT - -logger = logging.getLogger(__name__) - - -def get_query(): - current_folder_path = pathlib.Path(__file__).parent.absolute() - with open(current_folder_path.joinpath('query_repos_search.graphql')) as f: - query = f.read() - return query - - -query = get_query() - - -class GitHubV4Crawler(ICrawler): - """ - """ - type: str = 'github_v4_legacy' - - def __init__(self, base_url, state=None, api_key=None, **kwargs): - super().__init__( - base_url=base_url, - path='graphql', - state=state, - api_key=api_key, - **kwargs - ) - if api_key: - self.requests.headers.update( - {"Authorization": f"Bearer {api_key}"}) - - def handle_ratelimit(self, response): - """ - { - "data": { - "rateLimit": { - "cost": 1, - "remaining": 4984, - "resetAt": "2020-11-29T14:26:15Z" - }, - """ - rate_limit = response.json().get('data').get('rateLimit') - ratelimit_remaining = rate_limit['remaining'] - - reset_at = iso8601.parse_date(rate_limit['resetAt']) - ratelimit_reset_timestamp = reset_at.timestamp() - - reset_in = ratelimit_reset_timestamp - time.time() - - logger.info( - f'{self} {ratelimit_remaining} requests remaining, reset in {reset_in}s') - if ratelimit_remaining < 1: - logger.warning( - f'{self} rate limiting: {ratelimit_remaining} requests remaining, sleeping {reset_in}s') - time.sleep(reset_in) - - def get_variables(self, cursor): - # todo: there is no way to order by created_at? - # -> https://github.community/t/graphql-sorting-search-results/14088/2 - variables = { - #"queryString": "is:public archived:false created:2020-11-28T13:00:00Z..2020-11-28T14:00:00Z" - - "queryString": "is:public", - "cursor": cursor, - } - return variables - - def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: - """ :return: success, repos, state """ - cursor = None - if state: - cursor = state.get('cursor', None) - - hasNextPage = True - while hasNextPage: - variables = self.get_variables(cursor) - response = self.requests.post( - urljoin(self.base_url, self.path), - json=dict(query=query, variables=variables), - timeout=DEFAULT_REQUEST_TIMEOUT - ) - try: - data = response.json() - edges = data['data']['search']['edges'] - - page_info = data['search']['pageInfo'] - cursor = page_info['endCursor'] - hasNextPage = page_info['hasNextPage'] - - repos = [result['node'] for result in edges] - - print(len(repos)) - print(hasNextPage) - - state = dict(cursor=cursor) - yield True, repos, state - - self.handle_ratelimit(response) - except Exception as e: - logger.error(f'failed. response was: {response.json()}') - raise e - time.sleep(.01) diff --git a/crawlers/lib/platforms/github/query_repos_search.graphql b/crawlers/lib/platforms/github/query_repos_search.graphql deleted file mode 100644 index e5ab97f..0000000 --- a/crawlers/lib/platforms/github/query_repos_search.graphql +++ /dev/null @@ -1,37 +0,0 @@ -query listRepos($queryString: String!, $cursor: String) { - rateLimit { - cost - remaining - resetAt - } - search(query: $queryString, type: REPOSITORY, first: 100, after: $cursor) { - repositoryCount - pageInfo { - startCursor - endCursor - hasNextPage - } - edges { - node { - ... on Repository { - id - name - createdAt - updatedAt - pushedAt - description - isArchived - isPrivate - url - owner { - login - id - __typename - url - } - } - } - } - } -} - diff --git a/crawlers/lib/platforms/gitlab.py b/crawlers/lib/platforms/gitlab.py index 7318e73..a99ae62 100644 --- a/crawlers/lib/platforms/gitlab.py +++ b/crawlers/lib/platforms/gitlab.py @@ -2,11 +2,15 @@ import time from typing import List, Tuple +from requests import ConnectionError, Timeout, TooManyRedirects +from urllib3.exceptions import MaxRetryError + from crawlers.constants import GITLAB_PER_PAGE_MAX, DEFAULT_REQUEST_TIMEOUT from crawlers.lib.platforms.i_crawler import ICrawler logger = logging.getLogger(__name__) + class GitLabCrawler(ICrawler): type: str = 'gitlab' @@ -29,7 +33,7 @@ def set_state(cls, state: dict = None) -> dict: state = super().set_state(state) return state - def handle_ratelimit(self, response = None): + def handle_ratelimit(self, response=None): if response: remaining = int(response.headers.get("RateLimit-Remaining", -1)) reset_ts = int(response.headers.get("RateLimit-Reset", -1)) @@ -62,6 +66,10 @@ def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: logger.warning(response.headers.__dict__) return False, [], state # nr.1 - we skip rest of this block, hope we get it next time repos = response.json() + except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects) as e: + logger.exception(f"{self} - crawler cannot reach hoster") + # we re-raise these, as we want to avoid returning empty results to the indexer + raise e except Exception as e: logger.exception(f"(skipping block chunk) gitlab crawler crashed") return False, [], state # nr.2 - we skip rest of this block, hope we get it next time diff --git a/crawlers/lib/platforms/i_crawler.py b/crawlers/lib/platforms/i_crawler.py index 87a91f4..794dd81 100644 --- a/crawlers/lib/platforms/i_crawler.py +++ b/crawlers/lib/platforms/i_crawler.py @@ -4,7 +4,7 @@ import requests import time from urllib.parse import urljoin -from typing import List, Tuple +from typing import List, Tuple, Union from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry @@ -42,7 +42,7 @@ def handle_ratelimit(self, response=None): time.sleep(CRAWLER_DEFAULT_THROTTLE) def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]: - """ :return: success, repos, state """ + """ :return: success, repos, state, Exception (if any) """ raise NotImplementedError @staticmethod