-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(ingest/bigquery): support column-level lineage #8382
Changes from 1 commit
4de6350
2581742
07e27cb
d80842d
ee0441a
6768f1a
38970e7
8f50043
7378e17
cd2015e
352d8ed
a467472
c0e6124
786ce33
87eeb9d
0a76a9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import atexit | ||
import hashlib | ||
import logging | ||
import os | ||
import re | ||
|
@@ -112,6 +113,7 @@ | |
TagAssociationClass, | ||
) | ||
from datahub.specific.dataset import DatasetPatchBuilder | ||
from datahub.utilities.file_backed_collections import FileBackedDict | ||
from datahub.utilities.hive_schema_to_avro import ( | ||
HiveColumnToAvroConverter, | ||
get_schema_fields_for_hive_column, | ||
|
@@ -138,6 +140,10 @@ def cleanup(config: BigQueryV2Config) -> None: | |
os.unlink(config._credentials_path) | ||
|
||
|
||
def _generate_sql_id(sql: str) -> str: | ||
return hashlib.md5(sql.encode("utf-8")).hexdigest() | ||
|
||
|
||
@platform_name("BigQuery", doc_order=1) | ||
@config_class(BigQueryV2Config) | ||
@support_status(SupportStatus.CERTIFIED) | ||
|
@@ -254,8 +260,12 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): | |
|
||
# Global store of table identifiers for lineage filtering | ||
self.table_refs: Set[str] = set() | ||
# Maps project -> view_ref -> view definition (will be used when generating lineage) | ||
self.view_definitions: Dict[str, Dict[str, str]] = defaultdict(dict) | ||
|
||
# We do this so that the SQL is stored in a file-backed dict, but the sql IDs are stored in memory. | ||
# Maps project -> view_ref -> sql ID (will be used when generating lineage) | ||
self.view_definition_ids: Dict[str, Dict[str, str]] = defaultdict(dict) | ||
# Maps sql ID -> actual sql | ||
self.view_definitions: FileBackedDict[str] = FileBackedDict() | ||
|
||
self.sql_parser_schema_resolver = SchemaResolver( | ||
platform=self.platform, env=self.config.env | ||
|
@@ -666,7 +676,10 @@ def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: | |
) | ||
|
||
if self.config.lineage_parse_view_ddl: | ||
for view, view_definition in self.view_definitions[project_id].items(): | ||
for view, view_definition_id in self.view_definition_ids[ | ||
project_id | ||
].items(): | ||
view_definition = self.view_definitions[view_definition_id] | ||
raw_view_lineage = sqlglot_lineage( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we think this sql parser is strictly better than the old one? A fallback to the old one might be safer, but honestly I would like to err on the side of code velocity over safety (in cases like these), so I am fine with this as is. EDIT: That being said, a function like below might be nice, in case we support multiple parsers or adjust the call signature of sqlglot_lineage:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I initially wanted to do that. However the [...] Ideally we create a BigqueryContext object with the schema resolver, urn generation utils, etc. and pass that around everywhere. |
||
view_definition, | ||
platform=self.platform, | ||
|
@@ -871,7 +884,9 @@ def _process_view( | |
) | ||
self.table_refs.add(table_ref) | ||
if self.config.lineage_parse_view_ddl: | ||
self.view_definitions[project_id][table_ref] = view.view_definition | ||
view_definition_id = _generate_sql_id(view.view_definition) | ||
self.view_definition_ids[project_id][table_ref] = view_definition_id | ||
self.view_definitions[view_definition_id] = view.view_definition | ||
|
||
view.column_count = len(columns) | ||
if not view.column_count: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we do
view_definitions: view ref -> sql
and then a map project_id -> view ref
instead to avoid the hashing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that makes sense - will do in a follow up PR