gentle-lifeguard-88494
02/15/2023, 11:55 PM
@_run_with_query_combiner
def _get_dataset_column_distinct_values(
    self, column_profile: DatasetFieldProfileClass, column: str, unique_count: int, nonnull_count: int
) -> None:
    """Populate ``column_profile.distinctValues`` for a low-cardinality column.

    Does nothing when ``self.config.include_field_distinct_values`` is falsy
    or when ``unique_count`` exceeds 25.

    Otherwise the column is sampled in growing slices (1%, 5%, 10%, 25%, 50%
    and 100% of ``nonnull_count``). Sampling stops at the first slice whose
    set of distinct values has exactly ``unique_count`` members. If no slice
    reaches that count, ``distinctValues`` is left untouched.

    Args:
        column_profile: Field profile that receives the distinct values.
        column: Name of the column to sample.
        unique_count: Expected number of distinct values in the column.
        nonnull_count: Number of non-null rows; used to size each sample.

    Any exception raised while sampling is logged at debug level and
    reported as a profiling warning rather than propagated.
    """
    if not self.config.include_field_distinct_values or unique_count > 25:
        return

    try:
        # TODO do this without GE
        self.dataset.set_config_value("interactive_evaluation", True)

        # Check for distinct values in ever larger increments.
        previous_sample_size = None
        for percent in (1, 5, 10, 25, 50, 100):
            # Integer ceiling of nonnull_count * percent / 100: the sample
            # size is a row count, so it must be a whole number, and rounding
            # up keeps tiny tables from asking for a zero-row sample.
            sample_size = (nonnull_count * percent + 99) // 100
            if sample_size == previous_sample_size:
                # Same size as the previous step; an identical request would
                # presumably return the same rows, so skip it.
                continue
            previous_sample_size = sample_size

            # An empty allowed set is passed and the sampled values are read
            # back from "partial_unexpected_list" (presumably every non-null
            # value counts as unexpected -- confirm against GE behaviour).
            res = self.dataset.expect_column_values_to_be_in_set(
                column,
                [],
                result_format={
                    "result_format": "SUMMARY",
                    "partial_unexpected_count": sample_size,
                },
            ).result

            distinct_values = set(res["partial_unexpected_list"])
            if len(distinct_values) == unique_count:
                # Store the de-duplicated values (not the raw sample, which
                # repeats values), sorted so the output is deterministic.
                column_profile.distinctValues = sorted(
                    str(v) for v in distinct_values
                )
                # Exit loop if the distinct values are all captured
                break
    except Exception as e:
        logger.debug(
            f"Caught exception while attempting to get distinct values for column {column}. {e}"
        )
        self.report.report_warning(
            "Profiling - Unable to get column distinct values",
            f"{self.dataset_name}.{column}",
        )
gentle-lifeguard-88494
02/16/2023, 6:13 AM