Skip to content
Open
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,11 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
keras>=2.4.3,
'keras>=2.4.3,<3.0.0',
rapidfuzz>=2.6.1,
tensorflow>=2.6.4; sys.platform != 'darwin',
tensorflow>=2.6.4; sys_platform == 'darwin' and platform_machine != 'arm64',
tensorflow-macos>=2.6.4; sys_platform == 'darwin' and platform_machine == 'arm64',
"tensorflow>=2.6.4,<2.15.0; sys_platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
tqdm>=4.0.0,

# requirements-reports.txt
Expand Down
135 changes: 135 additions & 0 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import abc
import copy
import json
import re
import warnings
from typing import Any, Generic, TypeVar, cast
Expand Down Expand Up @@ -193,6 +194,15 @@ def __init__(self, is_enabled: bool = True) -> None:
"""
self.is_enabled = is_enabled

def __str__(self) -> str:
    """
    Render the option as the string form of its ``is_enabled`` value.

    :return: ``str(self.is_enabled)``
    :rtype: str
    """
    enabled_text: str = str(self.is_enabled)
    return enabled_text

def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]:
"""
Validate the options do not conflict and cause errors.
Expand Down Expand Up @@ -958,6 +968,25 @@ def __init__(
self.cms_relative_error = cms_relative_error
self.cms_max_num_heavy_hitters = cms_max_num_heavy_hitters

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"CategoricalOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"CategoricalOptions": option_entries}, indent=4)

Comment on lines +971 to +989
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be good to find a way to abstract this a bit more so that it ends up in BaseOption. 90%+ of this code is repeated with only string changes, so I think there is room to make this DRY-er.

def _validate_helper(self, variable_path: str = "CategoricalOptions") -> list[str]:
"""
Validate the options do not conflict and cause errors.
Expand Down Expand Up @@ -1182,6 +1211,25 @@ def __init__(
)
self.null_count: BooleanOption = BooleanOption(is_enabled=null_count)

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"RowStatisticsOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"RowStatisticsOptions": option_entries}, indent=4)

def _validate_helper(
self, variable_path: str = "RowStatisticsOptions"
) -> list[str]:
Expand Down Expand Up @@ -1228,6 +1276,25 @@ def __init__(self) -> None:
self.max_sample_size: int | None = None
self.data_labeler_object: BaseDataLabeler | None = None

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"DataLabelerOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"DataLabelerOptions": option_entries}, indent=4)

def __deepcopy__(self, memo: dict) -> DataLabelerOptions:
"""
Override deepcopy for data labeler object.
Expand Down Expand Up @@ -1370,6 +1437,25 @@ def __init__(
self.vocab: BooleanOption = BooleanOption(is_enabled=True)
self.words: BooleanOption = BooleanOption(is_enabled=True)

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"TextProfilerOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"TextProfilerOptions": option_entries}, indent=4)

def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> list[str]:
"""
Validate the options do not conflict and cause errors.
Expand Down Expand Up @@ -1488,6 +1574,25 @@ def __init__(
self.column_null_values = column_null_values
self.sampling_ratio = sampling_ratio

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"StructuredOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"StructuredOptions": option_entries}, indent=4)

@property
def enabled_profiles(self) -> list[str]:
"""Return a list of the enabled profilers for columns."""
Expand Down Expand Up @@ -1638,6 +1743,25 @@ def __init__(self) -> None:
self.text = TextProfilerOptions()
self.data_labeler = DataLabelerOptions()

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    Every public, non-callable attribute reported by ``dir(self)`` is
    rendered as a single-entry ``{name: str(value)}`` mapping, and the
    resulting list is emitted as indented JSON under the
    ``"UnstructuredOptions"`` key.

    :return: str of the option properties
    :rtype: str
    """
    option_entries: list[dict[str, str]] = []
    for attr_name in dir(self):
        # Skip dunder and underscore-prefixed names so that only the
        # user-facing option properties are reported.
        if attr_name.startswith("_"):
            continue
        # Resolve the attribute once; properties are evaluated on access.
        attr_value = getattr(self, attr_name)
        if callable(attr_value):
            continue
        option_entries.append({attr_name: str(attr_value)})
    return json.dumps({"UnstructuredOptions": option_entries}, indent=4)

@property
def enabled_profiles(self) -> list[str]:
"""Return a list of the enabled profilers."""
Expand Down Expand Up @@ -1715,6 +1839,17 @@ def __init__(self, presets: str = None) -> None:
else:
raise ValueError("The preset entered is not a valid preset.")

def __str__(self) -> str:
    """
    Return a human friendly consumable output in string form.

    The output has three newline-separated sections: the preset name,
    the string form of ``structured_options`` and the string form of
    ``unstructured_options``.

    :return: str of the option presets and properties
    :rtype: str
    """
    # Join explicit sections instead of using backslash continuations
    # inside a string literal, which would embed the source whitespace
    # around each continuation in the returned text.
    sections = (
        f"Presets: {self.presets}",
        str(self.structured_options),
        str(self.unstructured_options),
    )
    return "\n".join(sections)

Comment on lines +1842 to +1852
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense here, and you would test it in test_profiler_options.py.

def _complete_presets(self) -> None:
self.set({"*.is_enabled": True})

Expand Down
8 changes: 4 additions & 4 deletions requirements-ml.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
scikit-learn>=0.23.2
keras>=2.4.3
keras>=2.4.3,<3.0.0
rapidfuzz>=2.6.1
tensorflow>=2.6.4; sys.platform != 'darwin'
tensorflow>=2.6.4; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.6.4; sys_platform == 'darwin' and platform_machine == 'arm64'
tensorflow>=2.6.4,<2.15.0; sys_platform != 'darwin'
tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'
tqdm>=4.0.0
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
coverage>=5.0.1
dask>=2.29.0
dask>=2.29.0,<2024.2.0
fsspec>=0.3.3
pytest>=6.0.1
pytest-cov>=2.8.1
Expand Down