Skip to content

Commit

Permalink
fix(ingest/sql-common): Fix profile_table_level_only (datahub-project…
Browse files Browse the repository at this point in the history
…#8331)

Co-authored-by: Aseem Bansal <[email protected]>
Co-authored-by: Tamas Nemeth <[email protected]>
Co-authored-by: Harshal Sheth <[email protected]>
  • Loading branch information
4 people authored Jul 7, 2023
1 parent 8617e07 commit 1f84bf5
Show file tree
Hide file tree
Showing 10 changed files with 477 additions and 102 deletions.
4 changes: 4 additions & 0 deletions docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- #8263: Okta source config option `okta_profile_to_username_attr` default changed from `login` to `email`.
This determines which Okta profile attribute is used for the corresponding DataHub user
and thus may change what DataHub users are generated by the Okta source. In a follow-up, `okta_profile_to_username_regex` has been set to `.*`, which, taken together with the previous change, brings the defaults in line with OIDC.
- #8331: For all SQL-based sources that support profiling, you can no longer set
`profile_table_level_only` together with any of the `include_field_*` config options in order to ingest
only certain column-level metrics; this combination is now rejected with a validation error. Instead, set `profile_table_level_only` to `false` and
individually enable or disable the desired field-level metrics.

### Potential Downtime

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -579,16 +579,17 @@ def generate_dataset_profile( # noqa: C901 (complexity)
self.query_combiner.flush()

columns_profiling_queue: List[_SingleColumnSpec] = []
for column in all_columns:
column_profile = DatasetFieldProfileClass(fieldPath=column)
profile.fieldProfiles.append(column_profile)
if columns_to_profile:
for column in all_columns:
column_profile = DatasetFieldProfileClass(fieldPath=column)
profile.fieldProfiles.append(column_profile)

if column in columns_to_profile:
column_spec = _SingleColumnSpec(column, column_profile)
columns_profiling_queue.append(column_spec)
if column in columns_to_profile:
column_spec = _SingleColumnSpec(column, column_profile)
columns_profiling_queue.append(column_spec)

self._get_column_type(column_spec, column)
self._get_column_cardinality(column_spec, column)
self._get_column_type(column_spec, column)
self._get_column_cardinality(column_spec, column)

logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
self.query_combiner.flush()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def deprecate_bigquery_temp_table_schema(cls, values):
del values["bigquery_temp_table_schema"]
return values

@pydantic.root_validator()
@pydantic.root_validator(pre=True)
def ensure_field_level_settings_are_normalized(
cls: "GEProfilingConfig", values: Dict[str, Any]
) -> Dict[str, Any]:
Expand All @@ -167,7 +167,11 @@ def ensure_field_level_settings_are_normalized(
if values.get("profile_table_level_only"):
for field_level_metric in cls.__fields__:
if field_level_metric.startswith("include_field_"):
values.setdefault(field_level_metric, False)
if values.get(field_level_metric):
raise ValueError(
"Cannot enable field-level metrics if profile_table_level_only is set"
)
values[field_level_metric] = False

assert (
max_num_fields_to_profile is None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Ingestion recipe: MySQL source with profiling enabled in table-level-only
# mode; output goes to a local file sink.
source:
  type: mysql
  config:
    username: root
    password: example
    host_port: localhost:53307
    database: northwind
    profiling:
      enabled: True
      # Table-level only: do not combine with any include_field_* option.
      profile_table_level_only: true
sink:
  type: file
  config:
    filename: "./mysql_mces.json"
Loading

0 comments on commit 1f84bf5

Please sign in to comment.