diff --git a/README.md b/README.md index 01928d6..432ed6f 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,18 @@ artifactory-cleanup --help configuration for multiple repositories and some of them are not found. - Use `--worker-count=` to increase the number of workers. By default, it's 1. It's useful when you have a lot of artifacts and you want to speed up the process. +- Use `page_size` on a policy to fetch artifacts in pages instead of a single request. This can prevent timeouts on + large repositories. By default all artifacts are fetched at once. + +```yaml + - name: Remove all files from repo-name-here older than 7 days + page_size: 5000 # fetch 5000 artifacts per request + rules: + - rule: Repo + name: "reponame" + - rule: DeleteOlderThan + days: 7 +``` ## Commands ## diff --git a/artifactory_cleanup/loaders.py b/artifactory_cleanup/loaders.py index 63aff57..14833eb 100644 --- a/artifactory_cleanup/loaders.py +++ b/artifactory_cleanup/loaders.py @@ -86,8 +86,9 @@ def get_root_schema(self, rules): policy_schema = cfgv.Map( "Policy", "name", - cfgv.NoAdditionalKeys(["name", "rules"]), + cfgv.NoAdditionalKeys(["name", "page_size", "rules"]), cfgv.Required("name", cfgv.check_string), + cfgv.Optional("page_size", cfgv.check_int, None), cfgv.RequiredRecurse("rules", cfgv.Array(rule_schema)), ) @@ -151,6 +152,7 @@ def get_policies(self) -> List[CleanupPolicy]: for policy_data in config["artifactory-cleanup"]["policies"]: policy_name = policy_data["name"] + page_size = policy_data.get("page_size") rules = [] for rule_data in policy_data["rules"]: try: @@ -164,7 +166,7 @@ def get_policies(self) -> List[CleanupPolicy]: sys.exit(1) rules.append(rule) - policy = CleanupPolicy(policy_name, *rules) + policy = CleanupPolicy(policy_name, *rules, page_size=page_size) policies.append(policy) return policies diff --git a/artifactory_cleanup/rules/base.py b/artifactory_cleanup/rules/base.py index 733f44a..8d05808 100644 --- a/artifactory_cleanup/rules/base.py +++ b/artifactory_cleanup/rules/base.py @@ -126,6 +126,15 @@ def aql_add_filter(self, filters: List) -> List: """ return filters + def aql_add_include(self, includes: set) -> set: + """ + Declare which extra AQL fields this rule needs in `.include()`. + + The field `*` (all basic fields: repo, path, name, size, created, etc.) is always included. + Override this method and add `property` or `stat` only when your filter actually reads those fields. + """ + return includes + def aql_add_text(self, aql: str) -> str: """ You can change AQL text after applying all rules filters. @@ -167,7 +176,7 @@ class CleanupPolicy(object): session: Optional[BaseUrlSession] = None today: date = None - def __init__(self, name: str, *rules: Rule): + def __init__(self, name: str, *rules: Rule, page_size: Optional[int] = None): if not isinstance(name, str): raise ValueError( "Bad CleanupPolicy, first argument must be name.\n" @@ -177,6 +186,7 @@ def __init__(self, name: str, *rules: Rule): self.name = name self.rules = list(rules) self.aql_text = None + self.page_size = page_size # init object if passed not initialized class # for `rules.repo` rule, see above in the docstring @@ -252,8 +262,16 @@ def _get_aql_text(self, find_filters: Dict) -> str: """ Collect from all rules additional texts of requests """ + includes = {"*",} + for rule in self.rules: + includes = rule.aql_add_include(includes) + + include_fields = sorted(includes) + include_str = ", ".join(f'"{f}"' for f in include_fields) + print(f"AQL include fields: {include_str}") + filters_text = json.dumps(find_filters) - aql = f'{self.DOMAIN}.find({filters_text}).include("*", "property", "stat")' + aql = f'{self.DOMAIN}.find({filters_text}).include({include_str})' for rule in self.rules: before_aql = aql @@ -267,15 +285,45 @@ def _get_aql_text(self, find_filters: Dict) -> str: def get_artifacts(self) -> ArtifactsList: """ - Get artifacts from Artifactory by AQL filters that we collect from all rules in the policy + Get artifacts from Artifactory by AQL filters that we collect from all rules in the policy. + + By default all artifacts are fetched in a single request. Pass ``page_size`` to + ``CleanupPolicy`` to enable paginated fetching, which can avoid timeouts on large + repositories. + :return list of artifacts """ assert self.aql_text, "Call build_aql_query before calling get_artifacts" - r = self.session.post("/api/search/aql", data=self.aql_text) - r.raise_for_status() - content = r.json() - artifacts = content["results"] - return ArtifactsList.from_response(artifacts) + + if self.page_size is None: + r = self.session.post("/api/search/aql", data=self.aql_text) + r.raise_for_status() + content = r.json() + artifacts = content["results"] + return ArtifactsList.from_response(artifacts) + + all_artifacts: List[Dict] = [] + offset = 0 + + while True: + paginated_aql = ( + '{aql}.sort({{"$asc": ["created"]}}).offset({offset}).limit({limit})' + .format(aql=self.aql_text, offset=offset, limit=self.page_size) + ) + r = self.session.post("/api/search/aql", data=paginated_aql) + r.raise_for_status() + content = r.json() + page = content["results"] + all_artifacts.extend(page) + print( + f"Fetched page: {len(page)} artifacts (offset={offset}), " + f"total so far: {len(all_artifacts)}" + ) + if len(page) < self.page_size: + break + offset += self.page_size + + return ArtifactsList.from_response(all_artifacts) def filter(self, artifacts: ArtifactsList) -> ArtifactsList: """ diff --git a/artifactory_cleanup/rules/delete.py b/artifactory_cleanup/rules/delete.py index 981448b..5eab457 100644 --- a/artifactory_cleanup/rules/delete.py +++ b/artifactory_cleanup/rules/delete.py @@ -153,6 +153,11 @@ class DeleteLeastRecentlyUsedFiles(Rule): def __init__(self, keep: int): self.keep = keep + def aql_add_include(self, includes: set) -> set: + # Reads artifact["stats"]["downloaded"] via sort_by_usage() in filter() + includes.add("stat") + return includes + def filter(self, artifacts: ArtifactsList) -> ArtifactsList: # List will contain fresh files at the beginning artifacts.sort(key=utils.sort_by_usage, reverse=True) diff --git a/artifactory_cleanup/rules/keep.py b/artifactory_cleanup/rules/keep.py index a08c4f9..f9c879d 100644 --- a/artifactory_cleanup/rules/keep.py +++ b/artifactory_cleanup/rules/keep.py @@ -12,6 +12,11 @@ class KeepLatestNupkgNVersions(Rule): def __init__(self, count: int): self.count = count + def aql_add_include(self, includes: set) -> set: + # Reads nuget.id and nuget.version from artifact properties in filter() + includes.add("property") + return includes + def filter(self, artifacts): artifact_grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) diff --git a/artifactory_cleanup/rules/repo.py b/artifactory_cleanup/rules/repo.py index dc7cf50..929591a 100644 --- a/artifactory_cleanup/rules/repo.py +++ b/artifactory_cleanup/rules/repo.py @@ -120,6 +120,11 @@ def __init__(self, property_key, property_value): self.property_key = property_key self.property_value = str(property_value) + def aql_add_include(self, includes: set) -> set: + # Reads artifact["properties"] in filter() + includes.add("property") + return includes + def filter(self, artifacts): good_artifact = [ x