feat(docs): refactor source and sink docs #3031

Merged (40 commits), Aug 8, 2021
Changes shown are from 1 commit.

Commits:
All commits by kevinhu.

- 0b2f343 Begin reorg (Jul 27, 2021)
- 0916b75 Add links (Jul 27, 2021)
- 2bb1d79 Fix link (Jul 27, 2021)
- 487a2b6 Fix glue link (Jul 27, 2021)
- a24dc59 Add module installs to each page (Jul 27, 2021)
- 5c6a19a Consistency (Jul 27, 2021)
- 2382c30 Standardize sqlalchemy pattern (Jul 27, 2021)
- 34fbccf Add missing sql options (Jul 27, 2021)
- 9808735 More consistent recipes (Jul 27, 2021)
- 9af3cab Finish consistency checks for recipes (Jul 27, 2021)
- 9dc365f As above (Jul 28, 2021)
- 9afa393 Typo fixes (Jul 28, 2021)
- c6388cb More typo fixes (Jul 28, 2021)
- 8588cb9 More consistency fixes (Jul 28, 2021)
- 63691dd Fix broken links (Jul 28, 2021)
- f186b49 Merge branch 'master' of github.com:kevinhu/datahub into reorganize-docs (Jul 28, 2021)
- 410b9b8 Merge (Aug 2, 2021)
- 59623e4 Merge (Aug 2, 2021)
- eef2a62 Note on allow/deny (Aug 2, 2021)
- bee872f Add questions section (Aug 2, 2021)
- 124c0a3 Merge branch 'master' of github.com:kevinhu/datahub into reorganize-docs (Aug 2, 2021)
- 6ffd8a1 Fix inconsistencies (Aug 3, 2021)
- ba3cb36 Merge branch 'master' of github.com:kevinhu/datahub into reorganize-docs (Aug 3, 2021)
- 8a4de6d Begin separation of quickstart and config details (Aug 3, 2021)
- 8bf27a5 Write generic sqlalchemy options (Aug 3, 2021)
- 3dbb736 Up to looker (Aug 3, 2021)
- 186235f Add all config vars (Aug 4, 2021)
- 35ecc45 Add source config docs (Aug 4, 2021)
- 73a42fd Clean up quickstart configs (Aug 4, 2021)
- b1bf7e7 Update usage docs (Aug 4, 2021)
- 5933f1f Formatting (Aug 4, 2021)
- bbbe612 Revise capabilities (Aug 4, 2021)
- 30f9e6f Merge branch 'master' of github.com:kevinhu/datahub into reorganize-docs (Aug 4, 2021)
- 9cf1acb Merge (Aug 6, 2021)
- aa608b6 PR fixes (Aug 6, 2021)
- f429324 Add link back to main readme (Aug 6, 2021)
- 5fbac7b Add link back to recipe section (Aug 6, 2021)
- 387137f Add sink config placeholder (Aug 6, 2021)
- 34d6c57 Categories (Aug 6, 2021)
- 625baa0 Remove sink compatibility (Aug 6, 2021)
Finish consistency checks for recipes
kevinhu committed Jul 27, 2021
commit 9af3cab61a66fbee0b27de8df43ab7ada31f5fed
53 changes: 27 additions & 26 deletions metadata-ingestion/README.md
@@ -32,32 +32,33 @@ We use a plugin architecture so that you can install only the dependencies you actually need.

Sources:

The sources table is rewritten; the only content change is the new `kafka-connect` row:

| Plugin Name | Install Command | Provides |
| ----------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- |
| [file](./source_docs/file.md) | _included by default_ | File source and sink |
| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source |
| [druid](./source_docs/druid.md)                 | `pip install 'acryl-datahub[druid]'`                        | Druid source                        |
| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source |
| [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
| [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source |
| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source |
| [kafka-connect](./source_docs/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'`                | Kafka Connect source                |
| [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
| [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source |
| [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
| [mongodb](./source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
| [mssql](./source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
| [mysql](./source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source |
| [oracle](./source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source |
| [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source |
| [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source |
| [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source |
| [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
| [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source |
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
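
For orientation, each plugin name in the table doubles as the `type` of a recipe's source. A minimal sketch, assuming a local DataHub instance (the connection details are hypothetical placeholders):

```yml
# Minimal recipe sketch: the table's plugin name becomes `source.type`.
source:
  type: postgres
  config:
    host_port: localhost:5432 # hypothetical host and port
    database: mydb            # hypothetical database name
sink:
  type: datahub-rest
  config:
    server: "http://localhost:8080" # assumes a local DataHub GMS endpoint
```

Multiple extras can also be combined in a single install, e.g. `pip install 'acryl-datahub[postgres,kafka]'`.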

Sinks:

15 changes: 15 additions & 0 deletions metadata-ingestion/source_docs/bigquery.md
@@ -95,6 +95,21 @@ source:
top_n_queries: 10 # number of queries to save for each table

env: PROD

# Additional options to pass to google.cloud.logging_v2.client.Client
extra_client_options:

# To account for the possibility that the query event arrives after
# the read event in the audit logs, we wait for at least `query_log_delay`
# additional events to be processed before attempting to resolve BigQuery
# job information from the logs. If `query_log_delay` is None, it gets treated
# as an unlimited delay, which prioritizes correctness at the expense of memory usage.
query_log_delay:

# Correction to pad start_time and end_time with.
# For handling the case where the read happens within our time range but the query
# completion event is delayed and happens after the configured end time.
max_query_duration:
```
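
To make these knobs concrete, a hedged sketch of how they might be set inside the config block above (the values are illustrative assumptions, not recommendations):

```yml
query_log_delay: 100    # buffer ~100 extra events before resolving job info from the logs
max_query_duration: 900 # pad the time window by 15 minutes (in seconds; the exact accepted format is an assumption)
```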

:::note
24 changes: 24 additions & 0 deletions metadata-ingestion/source_docs/kafka-connect.md
@@ -0,0 +1,24 @@
# Kafka Connect `kafka-connect`

This plugin extracts the following:

- Each Kafka Connect connector as an individual `DataFlowSnapshotClass` entity
- Individual `DataJobSnapshotClass` entities, named using the `{connector_name}:{source_dataset}` convention
- Lineage information between the source database and the Kafka topic

```yml
source:
type: "kafka-connect"
config:
connect_uri: "https://localhost:8083"
cluster_name: "connect-cluster"
connector_patterns:
deny:
- ^denied-connector.*
allow:
- ^allowed-connector.*
```

Current limitations:

- Works only for Debezium source connectors.
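
For context, a full recipe also needs a sink. A minimal end-to-end sketch, assuming local endpoints (both URLs are hypothetical):

```yml
source:
  type: "kafka-connect"
  config:
    connect_uri: "http://localhost:8083" # hypothetical Kafka Connect REST endpoint
sink:
  type: datahub-rest
  config:
    server: "http://localhost:8080" # assumes a local DataHub GMS
```

which can then be run with `datahub ingest -c recipe.yml`.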
25 changes: 0 additions & 25 deletions metadata-ingestion/source_docs/kafka.md
@@ -36,28 +36,3 @@ source:
The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively.

For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).

The remainder of this hunk deletes the `# Kafka Connect` section from kafka.md; its content moves verbatim into [kafka-connect.md](./kafka-connect.md), shown above.
7 changes: 7 additions & 0 deletions metadata-ingestion/source_docs/ldap.md
@@ -15,9 +15,16 @@ source:
ldap_server: ldap://localhost
ldap_user: "cn=admin,dc=example,dc=org"
ldap_password: "admin"

# Extraction configuration.
base_dn: "dc=example,dc=org"
filter: "(objectClass=*)" # optional field

# If set to true, any users without first and last names will be dropped.
drop_missing_first_last_name: False # optional

# For creating LDAP controls
page_size: # default is 20
```

The `drop_missing_first_last_name` option should be set to true if you have many "headless" LDAP user accounts
20 changes: 17 additions & 3 deletions metadata-ingestion/source_docs/looker.md
@@ -16,9 +16,23 @@ source:
client_id: # Your Looker API3 client ID
client_secret: # Your Looker API3 client secret
base_url: # The URL of your Looker instance: https://company.looker.com:19999 or https://looker.company.com, or similar.

platform_name: "looker" # Optional, default is "looker"
actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl

# regex pattern to allow/deny dashboards
dashboard_pattern:
deny:
# ...
allow:
# ...

# regex pattern to allow/deny charts
chart_pattern:
deny:
# ...
allow:
# ...

env: "PROD" # Optional, default is "PROD"
```
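
To illustrate the allow/deny patterns, a hypothetical filled-in example: ingest only dashboards whose names start with `prod_`, except those ending in `_tmp`. The regexes are assumptions for illustration; in DataHub's pattern configs, deny rules generally take precedence over allow rules.

```yml
dashboard_pattern:
  allow:
    - "^prod_.*" # only dashboards prefixed with prod_
  deny:
    - ".*_tmp$"  # ...but skip temporary ones
```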
20 changes: 17 additions & 3 deletions metadata-ingestion/source_docs/lookml.md
@@ -16,11 +16,25 @@ source:
base_folder: /path/to/model/files # where the *.model.lkml and *.view.lkml files are stored
connection_to_platform_map: # mappings between connection names in the model files to platform names
connection_name: platform_name (or platform_name.database_name) # for ex. my_snowflake_conn: snowflake.my_database

platform_name: "looker" # optional, default is "looker"

# regex pattern to allow/deny models
model_pattern:
deny:
# ...
allow:
# ...

# regex pattern to allow/deny views
view_pattern:
deny:
# ...
allow:
# ...

env: "PROD" # optional, default is "PROD"
parse_table_names_from_sql: False # see note below
```
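
As a concrete (hypothetical) mapping, following the inline comment above: a LookML connection named `my_snowflake_conn` pointing at a Snowflake database called `analytics` would be written as:

```yml
connection_to_platform_map:
  my_snowflake_conn: snowflake.analytics # platform_name.database_name
```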

Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the
25 changes: 19 additions & 6 deletions metadata-ingestion/source_docs/mongodb.md
@@ -21,13 +21,26 @@ source:
connect_uri: "mongodb://localhost"
username: admin
password: password
# used for PyMongo
authMechanism: "DEFAULT"

options: {} # kwargs to pass to pymongo.MongoClient
enableSchemaInference: True
schemaSamplingSize: 1000 # number of samples for determining schema
useRandomSampling: True # whether to randomly sample docs for schema or just use the first ones, True by default

env: "PROD" # Optional, default is "PROD"

# regex pattern to allow/deny databases
database_pattern:
deny:
# ...
allow:
# ...
# regex pattern to allow/deny collections
collection_pattern:
deny:
# ...
allow:
# ...
```
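
Since `options` is forwarded as keyword arguments to `pymongo.MongoClient`, TLS settings, timeouts, and similar client flags can be threaded through it. A hedged sketch (the specific kwargs are assumptions about your deployment):

```yml
options:
  tls: true                      # enable TLS for the connection
  serverSelectionTimeoutMS: 5000 # fail fast if the server is unreachable
```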
1 change: 1 addition & 0 deletions metadata-ingestion/source_docs/mssql.md
@@ -63,6 +63,7 @@ source:
# already installed the Microsoft ODBC Driver for SQL Server.
# See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15
use_odbc: False
# args to URL-encode and append to the MSSQL connection URL
uri_args: {}
```
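
For instance, when `use_odbc` is enabled, the ODBC driver is typically selected through `uri_args`. A hedged sketch (the driver name depends on which version you installed):

```yml
use_odbc: True
uri_args:
  driver: "ODBC Driver 17 for SQL Server" # assumes this driver version is installed
```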

19 changes: 11 additions & 8 deletions metadata-ingestion/source_docs/snowflake.md
@@ -14,14 +14,7 @@ source:
username: user
password: pass
host_port: account_name

warehouse: "COMPUTE_WH" # optional
role: "sysadmin" # optional

@@ -33,6 +26,16 @@ source:
options:
# driver_option: some-option

# Regex filters for databases to allow/deny
database_pattern:
# The escaping of the $ symbol helps us skip the environment variable substitution.
allow:
- ^MY_DEMO_DATA.*
- ^ANOTHER_DB_REGEX
deny:
- ^SNOWFLAKE\$
- ^SNOWFLAKE_SAMPLE_DATA\$

# Tables to allow/deny
table_pattern:
deny:
4 changes: 3 additions & 1 deletion metadata-ingestion/source_docs/superset.md
@@ -10,10 +10,12 @@ This plugin extracts the following:
source:
type: superset
config:
username: user
password: pass
provider: db | ldap
connect_uri: https://localhost:8088

env: "PROD" # Optional, default is "PROD"
```
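
As with the other sources, this pairs with any sink; a minimal sketch writing metadata events to a local file instead of a server (the filename is a hypothetical choice):

```yml
sink:
  type: file
  config:
    filename: ./superset_mces.json # hypothetical output path
```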
