13 changes: 10 additions & 3 deletions crawlers/lib/crawl.py
@@ -6,11 +6,14 @@
import uuid
from typing import List, Generator
from flask import current_app
from requests import ConnectionError, Timeout, TooManyRedirects

from crawlers.constants import BLOCK_KEY_CALLBACK_URL

from crawlers.lib.platforms.i_crawler import ICrawler
from crawlers.lib.platforms import platforms
from urllib3.exceptions import MaxRetryError
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)

@@ -54,7 +57,11 @@ def process_block_url(session, block_url) -> None:
f"skip crawl - no callback_url found! - key: {BLOCK_KEY_CALLBACK_URL}, block_data: {block_data}"
)
else:
repos = run_block(block_data)
try:
repos = run_block(block_data)
except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects):
logger.exception("hosting service not reachable - no indexer callback issued")
return
_hoster_session_request(
"PUT", session, url=block_data[BLOCK_KEY_CALLBACK_URL], json=repos
)
Contributor Author:
This is the main part that fixes the potential issue - we simply don't issue the callback request if we encounter these specific exceptions.

All other exceptions are handled as before, meaning they only cause the chunk in which they occur to be empty. This case instead discards any crawled content (if any), drops the block without a callback, and asks for the next block (if we're in the automated workflow, as in docker-compose).
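To make that concrete, here is a minimal, hypothetical sketch of the automated workflow described above; `fetch_next_block`, `run_block` and `send_callback` are illustrative stand-ins, not the project's actual helpers:

```python
import logging

from requests import ConnectionError, Timeout, TooManyRedirects
from urllib3.exceptions import MaxRetryError

logger = logging.getLogger(__name__)


def block_loop(fetch_next_block, run_block, send_callback):
    """Hypothetical automated loop: keep asking the indexer for blocks."""
    while True:
        block_data = fetch_next_block()
        if block_data is None:
            break  # no more work handed out
        try:
            repos = run_block(block_data)
        except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects):
            # hoster unreachable: drop this block without a callback
            # and move on to the next one, mirroring the change above
            logger.exception("hosting service not reachable - no indexer callback issued")
            continue
        # any other exception is swallowed inside the crawler and only leaves
        # the affected chunk empty, so the callback still reports what we got
        send_callback(block_data, repos)
```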

Member:
I'm not sure anymore if we can deal with it like that.
Since we only clean up dead blocks on add_blocks (when the crawler sends the "answer"), we would only ever hand out more blocks, but never clean up these dead ones.
Also, right now the indexer schedules blocks as "hoster with the oldest run timestamp first", meaning crawling would be stuck on this hoster that the indexer never gets answers for. (And even if we change the way we schedule, this hoster would only pile up more and more dead blocks until Redis uses up all the RAM and crashes.)

Maybe we should think about a proper communication protocol first: something that wraps the repo list we return in some kind of state, so that the indexer has a way to, for example, pause a run on a hoster for a day on connection errors, or end the run completely without making an export - something like that?
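Purely as an illustration of that idea (nothing in this PR implements it), a wrapped callback payload could carry a run state next to the repo list; all names here are hypothetical:

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional


class RunState(str, Enum):
    """Hypothetical states a crawler could report back to the indexer."""
    OK = "ok"                  # chunk crawled normally
    PAUSE_RUN = "pause_run"    # e.g. on connection errors: retry this hoster later
    ABORT_RUN = "abort_run"    # end the run completely without making an export


@dataclass
class CallbackPayload:
    """Hypothetical envelope around the repo list sent to the callback URL."""
    state: RunState
    repos: List[dict] = field(default_factory=list)
    error: Optional[str] = None


# a crawler that cannot reach its hoster could then still answer the indexer:
payload = CallbackPayload(state=RunState.PAUSE_RUN, error="hoster unreachable")
```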

@@ -68,15 +75,15 @@ def crawl(platform: ICrawler) -> Generator[List[dict], None, None]:
:param platform: which platform to crawl, with what credentials
"""
logger.debug(f"START block: {platform.type} - initial state: {platform.state}")
for success, block_chunk, state in platform.crawl():
for success, block_chunk, state, exception in platform.crawl():
if success:
logger.info(f"got {len(block_chunk)} results from {platform} "
f"- first repo id: {next(iter(block_chunk), {}).get('id', None)}")
yield block_chunk
else:
# right now we dont want to emit failures (via yield) because that will send empty results back
# to the indexer, which can trigger a state reset (i.e. reached end, start over).
# TODO deal with failures - what are they?
# - complete connection failures and such should be handled via raised exceptions within crawlers!
pass
logger.debug(f"END block: {platform.type} - final state: {platform.state}")

5 changes: 2 additions & 3 deletions crawlers/lib/platforms/__init__.py
@@ -1,15 +1,14 @@
from typing import Dict, Any, Type, Union
from typing import Dict
from crawlers.lib.platforms.i_crawler import ICrawler
from crawlers.lib.platforms.gitea import GiteaCrawler
from crawlers.lib.platforms.gitlab import GitLabCrawler
from crawlers.lib.platforms.bitbucket import BitBucketCrawler
from crawlers.lib.platforms.github import GitHubV4Crawler, GitHubRESTCrawler
from crawlers.lib.platforms.github import GitHubV4Crawler

platforms: Dict[str, ICrawler] = {
GiteaCrawler.type: GiteaCrawler,
GitLabCrawler.type: GitLabCrawler,
GitHubV4Crawler.type: GitHubV4Crawler,
GitHubRESTCrawler.type: GitHubRESTCrawler,
BitBucketCrawler.type: BitBucketCrawler,
}

6 changes: 3 additions & 3 deletions crawlers/lib/platforms/bitbucket.py
@@ -1,7 +1,7 @@
import logging
import time
import requests
from typing import List, Tuple
from typing import List, Tuple, Union
from urllib.parse import urljoin

from crawlers.lib.platforms.i_crawler import ICrawler
@@ -46,7 +46,7 @@ def request(self, url):

return self.requests.get(url, timeout=DEFAULT_REQUEST_TIMEOUT)

def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]:
def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict, Union[Exception, None]]:
""" :return: success, repos, state """
url = False
if state:
@@ -65,7 +65,7 @@ def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]:
logger.error(e)
logger.error(e.response.reason)
logger.error(e.response.text)
return False, [], {}
return False, [], {}, e

response_json = response.json()
repos = response_json['values']
13 changes: 10 additions & 3 deletions crawlers/lib/platforms/gitea.py
@@ -1,5 +1,8 @@
import logging
from typing import List, Tuple
from typing import List, Tuple, Union

from requests import ConnectionError, Timeout, TooManyRedirects
from urllib3.exceptions import MaxRetryError

from crawlers.constants import GITEA_PER_PAGE_MAX, DEFAULT_REQUEST_TIMEOUT
from crawlers.lib.platforms.i_crawler import ICrawler
@@ -25,7 +28,7 @@ def set_state(cls, state: dict = None) -> dict:
state = super().set_state(state)
return state

def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]:
def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict, Union[Exception, None]]:
state = state or self.state
while self.has_next_crawl(state):
params = dict(
@@ -40,9 +43,13 @@ def crawl(self, state: dict = None) -> Tuple[bool, List[dict], dict]:
f"- response not ok, status: {response.status_code}")
return False, [], state # nr.1 - we skip rest of this block, hope we get it next time
result = response.json()
except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects) as e:
logger.exception(f"{self} - crawler cannot reach hoster")
# we re-raise these, as we want to avoid returning empty results to the indexer
raise e
except Exception as e:
logger.exception(f"(skipping block chunk) gitea crawler crashed")
return False, [], state # nr.2 - we skip rest of this block, hope we get it next time
return False, [], state, e # nr.2 - we skip rest of this block, hope we get it next time
Contributor Author:
The first except, with the specific exceptions, covers the ones we want to re-raise, as they mean we have a complete failure for the block - we could try to salvage these requests more, but I think we should start like this.

On the other side, all other exceptions are handled like before, as I said: we catch them and continue (but now we also yield any caught exceptions that occurred). A condensed sketch of both paths follows after this hunk.


state['is_done'] = len(result['data']) != state['per_page'] # finish early, we reached the end
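A condensed, hypothetical sketch of the two error-handling paths described in the comment above; the timeout value and the `crawl_chunk` helper shape are assumptions, not the crawler's real code:

```python
import logging
from typing import List, Optional, Tuple

import requests
from requests import ConnectionError, Timeout, TooManyRedirects
from urllib3.exceptions import MaxRetryError

logger = logging.getLogger(__name__)

CrawlResult = Tuple[bool, List[dict], dict, Optional[Exception]]


def crawl_chunk(url: str, params: dict, state: dict) -> CrawlResult:
    """One chunk of a Gitea-style crawl: re-raise connectivity errors, continue otherwise."""
    try:
        response = requests.get(url, params=params, timeout=120)  # timeout is an assumption
        if not response.ok:
            # non-OK answer: skip the rest of this block, hope the next run gets it
            return False, [], state, None
        result = response.json()
    except (MaxRetryError, ConnectionError, Timeout, TooManyRedirects):
        # complete failure for the block: re-raise so the caller can drop it
        # without issuing an indexer callback
        raise
    except Exception as e:
        # anything else only empties this chunk; the caught exception travels
        # back to the caller in the result tuple
        logger.exception("gitea crawler crashed - skipping block chunk")
        return False, [], state, e
    return True, result.get("data", []), state, None
```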

1 change: 0 additions & 1 deletion crawlers/lib/platforms/github/__init__.py
@@ -1,2 +1 @@
from .github_v4 import GitHubV4Crawler
from .github_rest import GitHubRESTCrawler
249 changes: 0 additions & 249 deletions crawlers/lib/platforms/github/github_rest.py

This file was deleted.
