-
Notifications
You must be signed in to change notification settings - Fork 244
Pretty Filestore #5484
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Pretty Filestore #5484
Changes from 9 commits
104da39
e78d38e
0d87796
cfdddc3
485b0bc
afac9ed
b229cd3
d5730f3
fdfe5f9
4853711
610a01a
1adeac3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,3 +35,4 @@ pytest-xdist | |
| build | ||
| check-jsonschema | ||
| strip_ansi==0.1.1 | ||
| edit_distance>=1.0.7,<2 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1088,18 +1088,60 @@ def jobs(self) -> Iterator[JobDescription]: | |
| # associated with a given job. | ||
| ########################################## | ||
|
|
||
| # Characters allowed in sanitized hints: these survive quote() unchanged | ||
| # and are safe in filesystem paths and S3 keys. | ||
| HINT_SAFE_RE = re.compile(r"[^a-zA-Z0-9_\-.]") | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| # Maximum length of a single sanitized hint path component. | ||
| MAX_HINT_LENGTH = 40 | ||
| # Maximum total length of the hints portion of the path (joined with /). | ||
| MAX_HINTS_PATH_LENGTH = 120 | ||
|
|
||
| def _sanitize_hints(self, hints: list[str] | None) -> list[str]: | ||
| """ | ||
| Turn user-supplied hints into path-safe components usable as | ||
| directory names on a filesystem or key segments in an object store. | ||
|
|
||
| Drops empty hints and hints that become empty after sanitization. | ||
| Truncates individual hints and the overall joined path to bounded | ||
| lengths so that the resulting file ID stays under a usable size. | ||
| """ | ||
| if not hints: | ||
| return [] | ||
| result: list[str] = [] | ||
| total_length = 0 | ||
| for hint in hints: | ||
| sanitized = self.HINT_SAFE_RE.sub("", hint) | ||
| sanitized = sanitized[: self.MAX_HINT_LENGTH] | ||
| if not sanitized: | ||
| continue | ||
| # +1 for the separator between components | ||
| if total_length + len(sanitized) + (1 if result else 0) > self.MAX_HINTS_PATH_LENGTH: | ||
| break | ||
| total_length += len(sanitized) + (1 if result else 0) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fiddly and repetitive and the comment does not get a the real point here, which is to take hints until the separator and the next hint no longer fit under the limit, then stop. |
||
| result.append(sanitized) | ||
| return result | ||
|
|
||
| # Don't add any new arguments to this old version; make people use the new one! | ||
| @deprecated(new_function_name="write_file") | ||
| def writeFile( | ||
| self, | ||
| localFilePath: str, | ||
| jobStoreID: str | None = None, | ||
| cleanup: bool = False, | ||
| ) -> str: | ||
| return self.write_file(localFilePath, jobStoreID, cleanup) | ||
| return self.write_file( | ||
| localFilePath, | ||
| job_id=jobStoreID, | ||
| cleanup=cleanup | ||
| ) | ||
|
|
||
| @abstractmethod | ||
| def write_file( | ||
| self, local_path: str, job_id: str | None = None, cleanup: bool = False | ||
| self, | ||
| local_path: str, | ||
| job_id: str | None = None, | ||
| cleanup: bool = False, | ||
| hints: list[str] | None = None, | ||
| ) -> str: | ||
| """ | ||
| Takes a file (as a path) and places it in this job store. Returns an ID that can be used | ||
|
|
@@ -1118,6 +1160,15 @@ def write_file( | |
| whose jobStoreID was given as jobStoreID is deleted with | ||
| jobStore.delete(job). If jobStoreID was not given, does nothing. | ||
|
|
||
| :param hints: String values such as a workflow names or task names that | ||
| should be used to store the file at a human-findable location. | ||
| Two files with the same basename and same hints are still | ||
| guaranteed to never collide and to have distinct assigned IDs. | ||
| Large numbers of files stored under the same non-empty hints | ||
| may be inefficient, as slot allocation scans existing entries; | ||
| hints are intended for human-navigable categorization, not for | ||
| high-throughput bulk file creation. | ||
|
|
||
| :raise ConcurrentFileModificationException: if the file was modified concurrently during | ||
| an invocation of this method | ||
|
|
||
|
|
@@ -1131,6 +1182,7 @@ def write_file( | |
| """ | ||
| raise NotImplementedError() | ||
|
|
||
| # Don't add any new arguments to this old version; make people use the new one! | ||
| @deprecated(new_function_name="write_file_stream") | ||
| def writeFileStream( | ||
| self, | ||
|
|
@@ -1140,14 +1192,21 @@ def writeFileStream( | |
| encoding: str | None = None, | ||
| errors: str | None = None, | ||
| ) -> ContextManager[tuple[IO[bytes], str]]: | ||
| return self.write_file_stream(jobStoreID, cleanup, basename, encoding, errors) | ||
| return self.write_file_stream( | ||
| jobStoreID, | ||
| cleanup=cleanup, | ||
| basename=basename, | ||
| encoding=encoding, | ||
| errors=errors | ||
| ) | ||
|
|
||
| @abstractmethod | ||
| @contextmanager | ||
| def write_file_stream( | ||
| self, | ||
| job_id: str | None = None, | ||
| cleanup: bool = False, | ||
| hints: list[str] | None = None, | ||
| basename: str | None = None, | ||
| encoding: str | None = None, | ||
| errors: str | None = None, | ||
|
|
@@ -1171,6 +1230,15 @@ def write_file_stream( | |
| file basename so that when searching the job store with a query | ||
| matching that basename, the file will be detected. | ||
|
|
||
| :param hints: String values such as a workflow names or task names that | ||
| should be used to store the file at a human-findable location. | ||
| Two files with the same basename and same hints are still | ||
| guaranteed to never collide and to have distinct assigned IDs. | ||
| Large numbers of files stored under the same non-empty hints | ||
| may be inefficient, as slot allocation scans existing entries; | ||
| hints are intended for human-navigable categorization, not for | ||
| high-throughput bulk file creation. | ||
|
|
||
| :param str encoding: the name of the encoding used to encode the file. Encodings are the same | ||
| as for encode(). Defaults to None which represents binary mode. | ||
|
|
||
|
|
@@ -1189,22 +1257,28 @@ def write_file_stream( | |
| :rtype: Iterator[Tuple[IO[bytes], str]] | ||
| """ | ||
| raise NotImplementedError() | ||
|
|
||
|
|
||
| # Don't add any new arguments to this old version; make people use the new one! | ||
| @deprecated(new_function_name="get_empty_file_store_id") | ||
| def getEmptyFileStoreID( | ||
| self, | ||
| jobStoreID: str | None = None, | ||
| cleanup: bool = False, | ||
| basename: str | None = None, | ||
| ) -> str: | ||
| return self.get_empty_file_store_id(jobStoreID, cleanup, basename) | ||
| return self.get_empty_file_store_id( | ||
| job_id=jobStoreID, | ||
| cleanup=cleanup, | ||
| basename=basename | ||
| ) | ||
|
|
||
| @abstractmethod | ||
| def get_empty_file_store_id( | ||
| self, | ||
| job_id: str | None = None, | ||
| cleanup: bool = False, | ||
| basename: str | None = None, | ||
| hints: list[str] | None = None, | ||
| ) -> str: | ||
| """ | ||
| Creates an empty file in the job store and returns its ID. | ||
|
|
@@ -1221,6 +1295,15 @@ def get_empty_file_store_id( | |
| file basename so that when searching the job store with a query | ||
| matching that basename, the file will be detected. | ||
|
|
||
| :param hints: String values such as a workflow names or task names that | ||
| should be used to store the file at a human-findable location. | ||
| Two files with the same basename and same hints are still | ||
| guaranteed to never collide and to have distinct assigned IDs. | ||
| Large numbers of files stored under the same non-empty hints | ||
| may be inefficient, as slot allocation scans existing entries; | ||
| hints are intended for human-navigable categorization, not for | ||
| high-throughput bulk file creation. | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This gets repeated a lot, so in addition to this part being of dubious conceptual coherence, it might benefit from the parameter description being shorter overall, with the concept of hints explained in more detail elsewhere. |
||
|
|
||
| :return: a jobStoreFileID that references the newly created file and can be used to reference the | ||
| file in the future. | ||
| :rtype: str | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This sounds like word salad of limited explanatory value.