From 0b2f343a7b02b65f5ca301a7ee7fcf8c41458aee Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 10:33:30 -0700 Subject: [PATCH 01/33] Begin reorg --- metadata-ingestion/README.md | 751 ------------------- metadata-ingestion/source_docs/athena.md | 23 + metadata-ingestion/source_docs/bigquery.md | 63 ++ metadata-ingestion/source_docs/datahub.md | 0 metadata-ingestion/source_docs/dbt.md | 39 + metadata-ingestion/source_docs/druid.md | 22 + metadata-ingestion/source_docs/feast.md | 22 + metadata-ingestion/source_docs/file.md | 12 + metadata-ingestion/source_docs/glue.md | 31 + metadata-ingestion/source_docs/hive.md | 47 ++ metadata-ingestion/source_docs/kafka.md | 47 ++ metadata-ingestion/source_docs/ldap.md | 23 + metadata-ingestion/source_docs/looker.md | 22 + metadata-ingestion/source_docs/lookml.md | 27 + metadata-ingestion/source_docs/mongodb.md | 31 + metadata-ingestion/source_docs/mssql.md | 66 ++ metadata-ingestion/source_docs/mysql.md | 31 + metadata-ingestion/source_docs/oracle.md | 25 + metadata-ingestion/source_docs/postgres.md | 23 + metadata-ingestion/source_docs/redshift.md | 41 + metadata-ingestion/source_docs/sagemaker.md | 34 + metadata-ingestion/source_docs/snowflake.md | 68 ++ metadata-ingestion/source_docs/sqlalchemy.md | 22 + metadata-ingestion/source_docs/superset.md | 19 + 24 files changed, 738 insertions(+), 751 deletions(-) create mode 100644 metadata-ingestion/source_docs/athena.md create mode 100644 metadata-ingestion/source_docs/bigquery.md create mode 100644 metadata-ingestion/source_docs/datahub.md create mode 100644 metadata-ingestion/source_docs/dbt.md create mode 100644 metadata-ingestion/source_docs/druid.md create mode 100644 metadata-ingestion/source_docs/feast.md create mode 100644 metadata-ingestion/source_docs/file.md create mode 100644 metadata-ingestion/source_docs/glue.md create mode 100644 metadata-ingestion/source_docs/hive.md create mode 100644 metadata-ingestion/source_docs/kafka.md create mode 100644 metadata-ingestion/source_docs/ldap.md create mode 100644 metadata-ingestion/source_docs/looker.md create mode 100644 metadata-ingestion/source_docs/lookml.md create mode 100644 metadata-ingestion/source_docs/mongodb.md create mode 100644 metadata-ingestion/source_docs/mssql.md create mode 100644 metadata-ingestion/source_docs/mysql.md create mode 100644 metadata-ingestion/source_docs/oracle.md create mode 100644 metadata-ingestion/source_docs/postgres.md create mode 100644 metadata-ingestion/source_docs/redshift.md create mode 100644 metadata-ingestion/source_docs/sagemaker.md create mode 100644 metadata-ingestion/source_docs/snowflake.md create mode 100644 metadata-ingestion/source_docs/sqlalchemy.md create mode 100644 metadata-ingestion/source_docs/superset.md diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 92eaca1b702c5..7d72a70bd0fdd 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -138,757 +138,6 @@ datahub ingest -c ./examples/recipes/mssql_to_datahub.yml A number of recipes are included in the examples/recipes directory. 
-## Sources - -### Kafka Metadata `kafka` - -Extracts: - -- List of topics - from the Kafka broker -- Schemas associated with each topic - from the schema registry - -```yml -source: - type: "kafka" - config: - connection: - bootstrap: "broker:9092" - consumer_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.DeserializingConsumer - schema_registry_url: http://localhost:8081 - schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient -``` - -The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. - -For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). - -### MySQL Metadata `mysql` - -Extracts: - -- List of databases and tables -- Column types and schema associated with each table - -```yml -source: - type: mysql - config: - username: root - password: example - database: dbname - host_port: localhost:3306 - table_pattern: - deny: - # Note that the deny patterns take precedence over the allow patterns. - - "performance_schema" - allow: - - "schema1.table2" - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. - schema_pattern: - deny: - - "garbage_schema" - allow: - - "schema1" -``` - -### Microsoft SQL Server Metadata `mssql` - -We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. - -Extracts: - -- List of databases, schema, tables and views -- Column types associated with each table/view - -```yml -source: - type: mssql - config: - username: user - password: pass - host_port: localhost:1433 - database: DemoDatabase - include_views: True # whether to include views, defaults to True - table_pattern: - deny: - - "^.*\\.sys_.*" # deny all tables that start with sys_ - allow: - - "schema1.table1" - - "schema1.table2" - options: - # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. - # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. - # Many of these options are specific to the underlying database driver, so that library's - # documentation will be a good reference for what is supported. To find which dialect is likely - # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. - charset: "utf8" - # If set to true, we'll use the pyodbc library. This requires you to have - # already installed the Microsoft ODBC Driver for SQL Server. - # See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 - use_odbc: False - uri_args: {} -``` - -
- Example: using ingestion with ODBC and encryption - -This requires you to have already installed the Microsoft ODBC Driver for SQL Server. -See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 - -```yml -source: - type: mssql - config: - # See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc - use_odbc: True - username: user - password: pass - host_port: localhost:1433 - database: DemoDatabase - include_views: True # whether to include views, defaults to True - uri_args: - # See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15 - driver: "ODBC Driver 17 for SQL Server" - Encrypt: "yes" - TrustServerCertificate: "Yes" - ssl: "True" - # Trusted_Connection: "yes" -``` - -
- -### Hive `hive` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table -- Detailed table and storage information - -```yml -source: - type: hive - config: - # For more details on authentication, see the PyHive docs: - # https://github.com/dropbox/PyHive#passing-session-configuration. - # LDAP, Kerberos, etc. are supported using connect_args, which can be - # added under the `options` config parameter. - #scheme: 'hive+http' # set this if Thrift should use the HTTP transport - #scheme: 'hive+https' # set this if Thrift should use the HTTP with SSL transport - username: user # optional - password: pass # optional - host_port: localhost:10000 - database: DemoDatabase # optional, defaults to 'default' - # table_pattern/schema_pattern is same as above - # options is same as above -``` - -
- Example: using ingestion with Azure HDInsight - -```yml -# Connecting to Microsoft Azure HDInsight using TLS. -source: - type: hive - config: - scheme: "hive+https" - host_port: .azurehdinsight.net:443 - username: admin - password: "" - options: - connect_args: - http_path: "/hive2" - auth: BASIC - # table_pattern/schema_pattern is same as above -``` - -
- -### PostgreSQL `postgres` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table -- Also supports PostGIS extensions -- database_alias (optional) can be used to change the name of database to be ingested - -```yml -source: - type: postgres - config: - username: user - password: pass - host_port: localhost:5432 - database: DemoDatabase - database_alias: DatabaseNameToBeIngested - include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above -``` - -### Redshift `redshift` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table -- Also supports PostGIS extensions - -```yml -source: - type: redshift - config: - username: user - password: pass - host_port: example.something.us-west-2.redshift.amazonaws.com:5439 - database: DemoDatabase - include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above -``` - -
- Extra options when running Redshift behind a proxy - -This requires you to have already installed the Microsoft ODBC Driver for SQL Server. -See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 - -```yml -source: - type: redshift - config: - # username, password, database, etc are all the same as above - host_port: my-proxy-hostname:5439 - options: - connect_args: - sslmode: "prefer" # or "require" or "verify-ca" - sslrootcert: ~ # needed to unpin the AWS Redshift certificate -``` - -
- -### AWS SageMaker `sagemaker` - -Extracts: - -- Feature groups -- Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job) - -```yml -source: - type: sagemaker - config: - aws_region: # aws_region_name, i.e. "eu-west-1" - env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD". - - # Credentials. If not specified here, these are picked up according to boto3 rules. - # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) - aws_access_key_id: # Optional. - aws_secret_access_key: # Optional. - aws_session_token: # Optional. - aws_role: # Optional (Role chaining supported by using a sorted list). - - extract_feature_groups: True # if feature groups should be ingested, default True - extract_models: True # if models should be ingested, default True - extract_jobs: # if jobs should be ingested, default True for all - auto_ml: True - compilation: True - edge_packaging: True - hyper_parameter_tuning: True - labeling: True - processing: True - training: True - transform: True -``` - -### Snowflake `snowflake` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table - -```yml -source: - type: snowflake - config: - username: user - password: pass - host_port: account_name - database_pattern: - # The escaping of the $ symbol helps us skip the environment variable substitution. - allow: - - ^MY_DEMO_DATA.* - - ^ANOTHER_DB_REGEX - deny: - - ^SNOWFLAKE\$ - - ^SNOWFLAKE_SAMPLE_DATA\$ - warehouse: "COMPUTE_WH" # optional - role: "sysadmin" # optional - include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above -``` - -:::tip - -You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source. - -::: - -### Superset `superset` - -Extracts: - -- List of charts and dashboards - -```yml -source: - type: superset - config: - username: user - password: pass - provider: db | ldap - connect_uri: http://localhost:8088 - env: "PROD" # Optional, default is "PROD" -``` - -See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. - -### Oracle `oracle` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table - -Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html). - -```yml -source: - type: oracle - config: - # For more details on authentication, see the documentation: - # https://docs.sqlalchemy.org/en/14/dialects/oracle.html#dialect-oracle-cx_oracle-connect and - # https://cx-oracle.readthedocs.io/en/latest/user_guide/connection_handling.html#connection-strings. 
- username: user - password: pass - host_port: localhost:5432 - database: dbname - service_name: svc # omit database if using this option - include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above -``` - -### Feast `feast` - -**Note: Feast ingestion requires Docker to be installed.** - -Extracts: - -- List of feature tables (modeled as [`MLFeatureTable`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), - features ([`MLFeature`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl)s), - and entities ([`MLPrimaryKey`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl)s) -- Column types associated with each feature and entity - -Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then -parsed to DataHub's native objects. This was done because of a dependency conflict in the `feast` module. - -```yml -source: - type: feast - config: - core_url: localhost:6565 # default - env: "PROD" # Optional, default is "PROD" - use_local_build: False # Whether to build Feast ingestion image locally, default is False -``` - -### Google BigQuery `bigquery` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table - -```yml -source: - type: bigquery - config: - project_id: project # optional - can autodetect from environment - options: # options is same as above - # See https://github.com/mxmzdlv/pybigquery#authentication for details. - credentials_path: "/path/to/keyfile.json" # optional - include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above -``` - -:::tip - -You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source. - -::: - -### AWS Athena `athena` - -Extracts: - -- List of databases and tables -- Column types associated with each table - -```yml -source: - type: athena - config: - username: aws_access_key_id # Optional. If not specified, credentials are picked up according to boto3 rules. - # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html - password: aws_secret_access_key # Optional. - database: database # Optional, defaults to "default" - aws_region: aws_region_name # i.e. "eu-west-1" - s3_staging_dir: s3_location # "s3:///prefix/" - # The s3_staging_dir parameter is needed because Athena always writes query results to S3. - # See https://docs.aws.amazon.com/athena/latest/ug/querying.html - # However, the athena driver will transparently fetch these results as you would expect from any other sql client. - work_group: athena_workgroup # "primary" - # table_pattern/schema_pattern is same as above -``` - -### AWS Glue `glue` - -Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](./s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. - -Extracts: - -- List of tables -- Column types associated with each table -- Table metadata, such as owner, description and parameters -- Jobs and their component transformations, data sources, and data sinks - -```yml -source: - type: glue - config: - aws_region: # aws_region_name, i.e. 
"eu-west-1" - extract_transforms: True # whether to ingest Glue jobs, defaults to True - env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD". - - # Filtering patterns for databases and tables to scan - database_pattern: # Optional, to filter databases scanned, same as schema_pattern above. - table_pattern: # Optional, to filter tables scanned, same as table_pattern above. - - # Credentials. If not specified here, these are picked up according to boto3 rules. - # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) - aws_access_key_id: # Optional. - aws_secret_access_key: # Optional. - aws_session_token: # Optional. - aws_role: # Optional (Role chaining supported by using a sorted list). -``` - -### Druid `druid` - -Extracts: - -- List of databases, schema, and tables -- Column types associated with each table - -**Note** It is important to define a explicitly define deny schema pattern for internal druid databases (lookup & sys) -if adding a schema pattern otherwise the crawler may crash before processing relevant databases. -This deny pattern is defined by default but is overriden by user-submitted configurations - -```yml -source: - type: druid - config: - # Point to broker address - host_port: localhost:8082 - schema_pattern: - deny: - - "^(lookup|sys).*" - # options is same as above -``` - -### Other databases using SQLAlchemy `sqlalchemy` - -The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen -database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) -defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself. - -Extracts: - -- List of schemas and tables -- Column types associated with each table - -```yml -source: - type: sqlalchemy - config: - # See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls - connect_uri: "dialect+driver://username:password@host:port/database" - options: {} # same as above - schema_pattern: {} # same as above - table_pattern: {} # same as above - include_views: True # whether to include views, defaults to True -``` - -### MongoDB `mongodb` - -Extracts: - -- List of databases -- List of collections in each database and infers schemas for each collection - -By default, schema inference samples 1,000 documents from each collection. Setting `schemaSamplingSize: null` will scan the entire collection. -Moreover, setting `useRandomSampling: False` will sample the first documents found without random selection, which may be faster for large collections. - -Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. - -```yml -source: - type: "mongodb" - config: - # For advanced configurations, see the MongoDB docs. 
- # https://pymongo.readthedocs.io/en/stable/examples/authentication.html - connect_uri: "mongodb://localhost" - username: admin - password: password - env: "PROD" # Optional, default is "PROD" - authMechanism: "DEFAULT" - options: {} - database_pattern: {} - collection_pattern: {} - enableSchemaInference: True - schemaSamplingSize: 1000 - useRandomSampling: True # whether to randomly sample docs for schema or just use the first ones, True by default - # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above -``` - -### LDAP `ldap` - -Extracts: - -- List of people -- Names, emails, titles, and manager information for each person -- List of groups - -```yml -source: - type: "ldap" - config: - ldap_server: ldap://localhost - ldap_user: "cn=admin,dc=example,dc=org" - ldap_password: "admin" - base_dn: "dc=example,dc=org" - filter: "(objectClass=*)" # optional field - drop_missing_first_last_name: False # optional -``` - -The `drop_missing_first_last_name` should be set to true if you've got many "headless" user LDAP accounts -for devices or services should be excluded when they do not contain a first and last name. This will only -impact the ingestion of LDAP users, while LDAP groups will be unaffected by this config option. - -### LookML `lookml` - -Note! This plugin uses a package that requires Python 3.7+! - -Extracts: - -- LookML views from model files -- Name, upstream table names, dimensions, measures, and dimension groups - -```yml -source: - type: "lookml" - config: - base_folder: /path/to/model/files # where the *.model.lkml and *.view.lkml files are stored - connection_to_platform_map: # mappings between connection names in the model files to platform names - connection_name: platform_name (or platform_name.database_name) # for ex. my_snowflake_conn: snowflake.my_database - model_pattern: {} - view_pattern: {} - env: "PROD" # optional, default is "PROD" - parse_table_names_from_sql: False # see note below - platform_name: "looker" # optional, default is "looker" -``` - -Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the -views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that -Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting -`parse_table_names_from_sql: True`. - -### Looker dashboards `looker` - -Extracts: - -- Looker dashboards and dashboard elements (charts) -- Names, descriptions, URLs, chart types, input view for the charts - -See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret. - -```yml -source: - type: "looker" - config: - client_id: # Your Looker API3 client ID - client_secret: # Your Looker API3 client secret - base_url: # The url to your Looker instance: https://company.looker.com:19999 or https://looker.company.com, or similar. - dashboard_pattern: # supports allow/deny regexes - chart_pattern: # supports allow/deny regexes - actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl - env: "PROD" # Optional, default is "PROD" - platform_name: "looker" # Optional, default is "looker" -``` - -### File `file` - -Pulls metadata from a previously generated file. Note that the file sink -can produce such files, and a number of samples are included in the -[examples/mce_files](examples/mce_files) directory. 
- -```yml -source: - type: file - config: - filename: ./path/to/mce/file.json -``` - -### dbt `dbt` - -Pull metadata from dbt artifacts files: - -- [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) - - This file contains model, source and lineage data. -- [dbt catalog file](https://docs.getdbt.com/reference/artifacts/catalog-json) - - This file contains schema data. - - dbt does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models -- [dbt sources file](https://docs.getdbt.com/reference/artifacts/sources-json) - - This file contains metadata for sources with freshness checks. - - We transfer dbt's freshness checks to DataHub's last-modified fields. - - Note that this file is optional – if not specified, we'll use time of ingestion instead as a proxy for time last-modified. -- target_platform: - - The data platform you are enriching with dbt metadata. - - [data platforms](https://github.com/linkedin/datahub/blob/master/gms/impl/src/main/resources/DataPlatformInfo.json) -- load_schemas: - - Load schemas from dbt catalog file, not necessary when the underlying data platform already has this data. -- node_type_pattern: - - Use this filter to exclude and include node types using allow or deny method - -```yml -source: - type: "dbt" - config: - manifest_path: "./path/dbt/manifest_file.json" - catalog_path: "./path/dbt/catalog_file.json" - sources_path: "./path/dbt/sources_file.json" # (optional, used for freshness checks) - target_platform: "postgres" # optional, eg "postgres", "snowflake", etc. - load_schemas: True or False - node_type_pattern: # optional - deny: - - ^test.* - allow: - - ^.* -``` - -Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage. - -### Google BigQuery Usage Stats `bigquery-usage` - -- Fetch a list of queries issued -- Fetch a list of tables and columns accessed -- Aggregate these statistics into buckets, by day or hour granularity - -Note: the client must have one of the following OAuth scopes, and should be authorized on all projects you'd like to ingest usage stats from. - -- https://www.googleapis.com/auth/logging.read -- https://www.googleapis.com/auth/logging.admin -- https://www.googleapis.com/auth/cloud-platform.read-only -- https://www.googleapis.com/auth/cloud-platform - -```yml -source: - type: bigquery-usage - config: - projects: # optional - can autodetect a single project from the environment - - project_id_1 - - project_id_2 - options: - # See https://googleapis.dev/python/logging/latest/client.html for details. - credentials: ~ # optional - see docs - env: PROD - - bucket_duration: "DAY" - start_time: ~ # defaults to the last full day in UTC (or hour) - end_time: ~ # defaults to the last full day in UTC (or hour) - - top_n_queries: 10 # number of queries to save for each table -``` - -:::note - -This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source. - -::: - -### Snowflake Usage Stats `snowflake-usage` - -- Fetch a list of queries issued -- Fetch a list of tables and columns accessed (excludes views) -- Aggregate these statistics into buckets, by day or hour granularity - -Note: the user/role must have access to the account usage table. 
The "accountadmin" role has this by default, and other roles can be [granted this permission](https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles). - -Note: the underlying access history views that we use are only available in Snowflake's enterprise edition or higher. - -```yml -source: - type: snowflake-usage - config: - username: user - password: pass - host_port: account_name - role: ACCOUNTADMIN - env: PROD - - bucket_duration: "DAY" - start_time: ~ # defaults to the last full day in UTC (or hour) - end_time: ~ # defaults to the last full day in UTC (or hour) - - top_n_queries: 10 # number of queries to save for each table -``` - -:::note - -This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source. - -::: - -### Kafka Connect `kafka-connect` - -Extracts: - -- Kafka Connect connector as individual `DataFlowSnapshotClass` entity -- Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming -- Lineage information between source database to Kafka topic - -```yml -source: - type: "kafka-connect" - config: - connect_uri: "http://localhost:8083" - cluster_name: "connect-cluster" - connector_patterns: - deny: - - ^denied-connector.* - allow: - - ^allowed-connector.* -``` - -Current limitations: - -- Currently works only for Debezium source connectors. - ## Sinks ### DataHub Rest `datahub-rest` diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md new file mode 100644 index 0000000000000..726c2521fa49e --- /dev/null +++ b/metadata-ingestion/source_docs/athena.md @@ -0,0 +1,23 @@ +# AWS Athena `athena` + +Extracts: + +- List of databases and tables +- Column types associated with each table + +```yml +source: + type: athena + config: + username: aws_access_key_id # Optional. If not specified, credentials are picked up according to boto3 rules. + # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html + password: aws_secret_access_key # Optional. + database: database # Optional, defaults to "default" + aws_region: aws_region_name # i.e. "eu-west-1" + s3_staging_dir: s3_location # "s3:///prefix/" + # The s3_staging_dir parameter is needed because Athena always writes query results to S3. + # See https://docs.aws.amazon.com/athena/latest/ug/querying.html + # However, the athena driver will transparently fetch these results as you would expect from any other sql client. + work_group: athena_workgroup # "primary" + # table_pattern/schema_pattern is same as above +``` diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md new file mode 100644 index 0000000000000..5d3eb9f636109 --- /dev/null +++ b/metadata-ingestion/source_docs/bigquery.md @@ -0,0 +1,63 @@ +# Google BigQuery `bigquery` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table + +```yml +source: + type: bigquery + config: + project_id: project # optional - can autodetect from environment + options: # options is same as above + # See https://github.com/mxmzdlv/pybigquery#authentication for details. + credentials_path: "/path/to/keyfile.json" # optional + include_views: True # whether to include views, defaults to True + # table_pattern/schema_pattern is same as above +``` + +:::tip + +You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source. 
+ +::: + + +# Google BigQuery Usage Stats `bigquery-usage` + +- Fetch a list of queries issued +- Fetch a list of tables and columns accessed +- Aggregate these statistics into buckets, by day or hour granularity + +Note: the client must have one of the following OAuth scopes, and should be authorized on all projects you'd like to ingest usage stats from. + +- https://www.googleapis.com/auth/logging.read +- https://www.googleapis.com/auth/logging.admin +- https://www.googleapis.com/auth/cloud-platform.read-only +- https://www.googleapis.com/auth/cloud-platform + +```yml +source: + type: bigquery-usage + config: + projects: # optional - can autodetect a single project from the environment + - project_id_1 + - project_id_2 + options: + # See https://googleapis.dev/python/logging/latest/client.html for details. + credentials: ~ # optional - see docs + env: PROD + + bucket_duration: "DAY" + start_time: ~ # defaults to the last full day in UTC (or hour) + end_time: ~ # defaults to the last full day in UTC (or hour) + + top_n_queries: 10 # number of queries to save for each table +``` + +:::note + +This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source. + +::: diff --git a/metadata-ingestion/source_docs/datahub.md b/metadata-ingestion/source_docs/datahub.md new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md new file mode 100644 index 0000000000000..7e52e30a7daa6 --- /dev/null +++ b/metadata-ingestion/source_docs/dbt.md @@ -0,0 +1,39 @@ +# dbt `dbt` + +Pull metadata from dbt artifacts files: + +- [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) + - This file contains model, source and lineage data. +- [dbt catalog file](https://docs.getdbt.com/reference/artifacts/catalog-json) + - This file contains schema data. + - dbt does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models +- [dbt sources file](https://docs.getdbt.com/reference/artifacts/sources-json) + - This file contains metadata for sources with freshness checks. + - We transfer dbt's freshness checks to DataHub's last-modified fields. + - Note that this file is optional – if not specified, we'll use time of ingestion instead as a proxy for time last-modified. +- target_platform: + - The data platform you are enriching with dbt metadata. + - [data platforms](https://github.com/linkedin/datahub/blob/master/gms/impl/src/main/resources/DataPlatformInfo.json) +- load_schemas: + - Load schemas from dbt catalog file, not necessary when the underlying data platform already has this data. +- node_type_pattern: + - Use this filter to exclude and include node types using allow or deny method + +```yml +source: + type: "dbt" + config: + manifest_path: "./path/dbt/manifest_file.json" + catalog_path: "./path/dbt/catalog_file.json" + sources_path: "./path/dbt/sources_file.json" # (optional, used for freshness checks) + target_platform: "postgres" # optional, eg "postgres", "snowflake", etc. + load_schemas: True or False + node_type_pattern: # optional + deny: + - ^test.* + allow: + - ^.* +``` + +Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage. 
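If you are unsure where these artifact files come from, they are produced by standard dbt commands. A rough sketch, assuming default `target/` output paths (the freshness command name varies by dbt version):

```shell
# writes target/manifest.json and target/catalog.json
dbt docs generate

# writes target/sources.json (optional, only needed for freshness metadata;
# older dbt releases call this `dbt source snapshot-freshness`)
dbt source freshness
```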
+ diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md new file mode 100644 index 0000000000000..bd7dae7f2ac73 --- /dev/null +++ b/metadata-ingestion/source_docs/druid.md @@ -0,0 +1,22 @@ +# Druid `druid` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table + +**Note** It is important to define a explicitly define deny schema pattern for internal druid databases (lookup & sys) +if adding a schema pattern otherwise the crawler may crash before processing relevant databases. +This deny pattern is defined by default but is overriden by user-submitted configurations + +```yml +source: + type: druid + config: + # Point to broker address + host_port: localhost:8082 + schema_pattern: + deny: + - "^(lookup|sys).*" + # options is same as above +``` diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md new file mode 100644 index 0000000000000..24a2c1c72d788 --- /dev/null +++ b/metadata-ingestion/source_docs/feast.md @@ -0,0 +1,22 @@ +# Feast `feast` + +**Note: Feast ingestion requires Docker to be installed.** + +Extracts: + +- List of feature tables (modeled as [`MLFeatureTable`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), + features ([`MLFeature`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl)s), + and entities ([`MLPrimaryKey`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl)s) +- Column types associated with each feature and entity + +Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then +parsed to DataHub's native objects. This was done because of a dependency conflict in the `feast` module. + +```yml +source: + type: feast + config: + core_url: localhost:6565 # default + env: "PROD" # Optional, default is "PROD" + use_local_build: False # Whether to build Feast ingestion image locally, default is False +``` diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md new file mode 100644 index 0000000000000..8b0389a753dd7 --- /dev/null +++ b/metadata-ingestion/source_docs/file.md @@ -0,0 +1,12 @@ +### File `file` + +Pulls metadata from a previously generated file. Note that the file sink +can produce such files, and a number of samples are included in the +[examples/mce_files](examples/mce_files) directory. + +```yml +source: + type: file + config: + filename: ./path/to/mce/file.json +``` \ No newline at end of file diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md new file mode 100644 index 0000000000000..ab723ea45af1c --- /dev/null +++ b/metadata-ingestion/source_docs/glue.md @@ -0,0 +1,31 @@ + +# AWS Glue `glue` + +Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](./s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. + +Extracts: + +- List of tables +- Column types associated with each table +- Table metadata, such as owner, description and parameters +- Jobs and their component transformations, data sources, and data sinks + +```yml +source: + type: glue + config: + aws_region: # aws_region_name, i.e. 
"eu-west-1" + extract_transforms: True # whether to ingest Glue jobs, defaults to True + env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD". + + # Filtering patterns for databases and tables to scan + database_pattern: # Optional, to filter databases scanned, same as schema_pattern above. + table_pattern: # Optional, to filter tables scanned, same as table_pattern above. + + # Credentials. If not specified here, these are picked up according to boto3 rules. + # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) + aws_access_key_id: # Optional. + aws_secret_access_key: # Optional. + aws_session_token: # Optional. + aws_role: # Optional (Role chaining supported by using a sorted list). +``` diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md new file mode 100644 index 0000000000000..3eb36bf620bae --- /dev/null +++ b/metadata-ingestion/source_docs/hive.md @@ -0,0 +1,47 @@ +# Hive `hive` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table +- Detailed table and storage information + +```yml +source: + type: hive + config: + # For more details on authentication, see the PyHive docs: + # https://github.com/dropbox/PyHive#passing-session-configuration. + # LDAP, Kerberos, etc. are supported using connect_args, which can be + # added under the `options` config parameter. + #scheme: 'hive+http' # set this if Thrift should use the HTTP transport + #scheme: 'hive+https' # set this if Thrift should use the HTTP with SSL transport + username: user # optional + password: pass # optional + host_port: localhost:10000 + database: DemoDatabase # optional, defaults to 'default' + # table_pattern/schema_pattern is same as above + # options is same as above +``` + +
+ Example: using ingestion with Azure HDInsight + +```yml +# Connecting to Microsoft Azure HDInsight using TLS. +source: + type: hive + config: + scheme: "hive+https" + host_port: .azurehdinsight.net:443 + username: admin + password: "" + options: + connect_args: + http_path: "/hive2" + auth: BASIC + # table_pattern/schema_pattern is same as above +``` + +
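Kerberos and LDAP logins follow the same pattern as the HDInsight example above: extra driver arguments go under `options.connect_args`. A rough, untested sketch using PyHive's parameter names (adjust them for your cluster):

```yml
source:
  type: hive
  config:
    host_port: hive-server:10000
    options:
      connect_args:
        auth: KERBEROS # for LDAP, use `auth: LDAP` together with the username/password fields instead
        kerberos_service_name: hive # assumed service principal name; check your cluster's configuration
```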
+ diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md new file mode 100644 index 0000000000000..20b49c417f124 --- /dev/null +++ b/metadata-ingestion/source_docs/kafka.md @@ -0,0 +1,47 @@ +# Kafka Metadata `kafka` + +Extracts: + +- List of topics - from the Kafka broker +- Schemas associated with each topic - from the schema registry + +```yml +source: + type: "kafka" + config: + connection: + bootstrap: "broker:9092" + consumer_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.DeserializingConsumer + schema_registry_url: http://localhost:8081 + schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient +``` + +The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. + +For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). + + +# Kafka Connect `kafka-connect` + +Extracts: + +- Kafka Connect connector as individual `DataFlowSnapshotClass` entity +- Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming +- Lineage information between source database to Kafka topic + +```yml +source: + type: "kafka-connect" + config: + connect_uri: "http://localhost:8083" + cluster_name: "connect-cluster" + connector_patterns: + deny: + - ^denied-connector.* + allow: + - ^allowed-connector.* +``` + +Current limitations: + +- Currently works only for Debezium source connectors. diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md new file mode 100644 index 0000000000000..243075da4a83a --- /dev/null +++ b/metadata-ingestion/source_docs/ldap.md @@ -0,0 +1,23 @@ +# LDAP `ldap` + +Extracts: + +- List of people +- Names, emails, titles, and manager information for each person +- List of groups + +```yml +source: + type: "ldap" + config: + ldap_server: ldap://localhost + ldap_user: "cn=admin,dc=example,dc=org" + ldap_password: "admin" + base_dn: "dc=example,dc=org" + filter: "(objectClass=*)" # optional field + drop_missing_first_last_name: False # optional +``` + +The `drop_missing_first_last_name` should be set to true if you've got many "headless" user LDAP accounts +for devices or services should be excluded when they do not contain a first and last name. This will only +impact the ingestion of LDAP users, while LDAP groups will be unaffected by this config option. diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md new file mode 100644 index 0000000000000..a195a6ef3b2da --- /dev/null +++ b/metadata-ingestion/source_docs/looker.md @@ -0,0 +1,22 @@ +# Looker dashboards `looker` + +Extracts: + +- Looker dashboards and dashboard elements (charts) +- Names, descriptions, URLs, chart types, input view for the charts + +See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret. + +```yml +source: + type: "looker" + config: + client_id: # Your Looker API3 client ID + client_secret: # Your Looker API3 client secret + base_url: # The url to your Looker instance: https://company.looker.com:19999 or https://looker.company.com, or similar. 
+ dashboard_pattern: # supports allow/deny regexes + chart_pattern: # supports allow/deny regexes + actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl + env: "PROD" # Optional, default is "PROD" + platform_name: "looker" # Optional, default is "looker" +``` \ No newline at end of file diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md new file mode 100644 index 0000000000000..3843ba39bee6d --- /dev/null +++ b/metadata-ingestion/source_docs/lookml.md @@ -0,0 +1,27 @@ +# LookML `lookml` + +Note! This plugin uses a package that requires Python 3.7+! + +Extracts: + +- LookML views from model files +- Name, upstream table names, dimensions, measures, and dimension groups + +```yml +source: + type: "lookml" + config: + base_folder: /path/to/model/files # where the *.model.lkml and *.view.lkml files are stored + connection_to_platform_map: # mappings between connection names in the model files to platform names + connection_name: platform_name (or platform_name.database_name) # for ex. my_snowflake_conn: snowflake.my_database + model_pattern: {} + view_pattern: {} + env: "PROD" # optional, default is "PROD" + parse_table_names_from_sql: False # see note below + platform_name: "looker" # optional, default is "looker" +``` + +Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the +views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that +Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting +`parse_table_names_from_sql: True`. diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md new file mode 100644 index 0000000000000..a951c992a2d61 --- /dev/null +++ b/metadata-ingestion/source_docs/mongodb.md @@ -0,0 +1,31 @@ +# MongoDB `mongodb` + +Extracts: + +- List of databases +- List of collections in each database and infers schemas for each collection + +By default, schema inference samples 1,000 documents from each collection. Setting `schemaSamplingSize: null` will scan the entire collection. +Moreover, setting `useRandomSampling: False` will sample the first documents found without random selection, which may be faster for large collections. + +Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. + +```yml +source: + type: "mongodb" + config: + # For advanced configurations, see the MongoDB docs. 
+ # https://pymongo.readthedocs.io/en/stable/examples/authentication.html + connect_uri: "mongodb://localhost" + username: admin + password: password + env: "PROD" # Optional, default is "PROD" + authMechanism: "DEFAULT" + options: {} + database_pattern: {} + collection_pattern: {} + enableSchemaInference: True + schemaSamplingSize: 1000 + useRandomSampling: True # whether to randomly sample docs for schema or just use the first ones, True by default + # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above +``` diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md new file mode 100644 index 0000000000000..8317ef7a7deb4 --- /dev/null +++ b/metadata-ingestion/source_docs/mssql.md @@ -0,0 +1,66 @@ +# Microsoft SQL Server Metadata `mssql` + +We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. + +Extracts: + +- List of databases, schema, tables and views +- Column types associated with each table/view + +```yml +source: + type: mssql + config: + username: user + password: pass + host_port: localhost:1433 + database: DemoDatabase + include_views: True # whether to include views, defaults to True + table_pattern: + deny: + - "^.*\\.sys_.*" # deny all tables that start with sys_ + allow: + - "schema1.table1" + - "schema1.table2" + options: + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + charset: "utf8" + # If set to true, we'll use the pyodbc library. This requires you to have + # already installed the Microsoft ODBC Driver for SQL Server. + # See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 + use_odbc: False + uri_args: {} +``` + +
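To make the `options` passthrough described in the comments above concrete, here is a rough sketch that mixes plain SQLAlchemy engine arguments with driver-level ones (values are illustrative, not recommendations):

```yml
source:
  type: mssql
  config:
    username: user
    password: pass
    host_port: localhost:1433
    database: DemoDatabase
    options:
      pool_size: 5 # standard SQLAlchemy create_engine kwargs
      max_overflow: 10
      connect_args:
        login_timeout: 30 # assumed python-tds connection parameter; consult your driver's docs
```

The ODBC example below covers encryption settings, which go through `uri_args` rather than `options`.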
+ Example: using ingestion with ODBC and encryption + +This requires you to have already installed the Microsoft ODBC Driver for SQL Server. +See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 + +```yml +source: + type: mssql + config: + # See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc + use_odbc: True + username: user + password: pass + host_port: localhost:1433 + database: DemoDatabase + include_views: True # whether to include views, defaults to True + uri_args: + # See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15 + driver: "ODBC Driver 17 for SQL Server" + Encrypt: "yes" + TrustServerCertificate: "Yes" + ssl: "True" + # Trusted_Connection: "yes" +``` + +
+ diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md new file mode 100644 index 0000000000000..ecf564395d41c --- /dev/null +++ b/metadata-ingestion/source_docs/mysql.md @@ -0,0 +1,31 @@ +### MySQL `mysql` + +Extracts: + +- List of databases and tables +- Column types and schema associated with each table + +```yml +source: + type: mysql + config: + username: root + password: example + database: dbname + host_port: localhost:3306 + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "performance_schema" + allow: + - "schema1.table2" + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "garbage_schema" + allow: + - "schema1" +``` diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md new file mode 100644 index 0000000000000..264c5e18804c6 --- /dev/null +++ b/metadata-ingestion/source_docs/oracle.md @@ -0,0 +1,25 @@ +# Oracle `oracle` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table + +Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html). + +```yml +source: + type: oracle + config: + # For more details on authentication, see the documentation: + # https://docs.sqlalchemy.org/en/14/dialects/oracle.html#dialect-oracle-cx_oracle-connect and + # https://cx-oracle.readthedocs.io/en/latest/user_guide/connection_handling.html#connection-strings. 
+ username: user + password: pass + host_port: localhost:5432 + database: dbname + service_name: svc # omit database if using this option + include_views: True # whether to include views, defaults to True + # table_pattern/schema_pattern is same as above + # options is same as above +``` diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md new file mode 100644 index 0000000000000..92ffef44a5718 --- /dev/null +++ b/metadata-ingestion/source_docs/postgres.md @@ -0,0 +1,23 @@ +# PostgreSQL `postgres` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table +- Also supports PostGIS extensions +- database_alias (optional) can be used to change the name of database to be ingested + +```yml +source: + type: postgres + config: + username: user + password: pass + host_port: localhost:5432 + database: DemoDatabase + database_alias: DatabaseNameToBeIngested + include_views: True # whether to include views, defaults to True + # table_pattern/schema_pattern is same as above + # options is same as above +``` + diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md new file mode 100644 index 0000000000000..22595a055df5a --- /dev/null +++ b/metadata-ingestion/source_docs/redshift.md @@ -0,0 +1,41 @@ +# Redshift `redshift` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table +- Also supports PostGIS extensions + +```yml +source: + type: redshift + config: + username: user + password: pass + host_port: example.something.us-west-2.redshift.amazonaws.com:5439 + database: DemoDatabase + include_views: True # whether to include views, defaults to True + # table_pattern/schema_pattern is same as above + # options is same as above +``` + +
+  Extra options when running Redshift behind a proxy
+
+When Redshift is reached through a proxy or SSH tunnel, you may need to pass additional SSL settings to the
+underlying Postgres driver via `connect_args`:
+
+```yml
+source:
+  type: redshift
+  config:
+    # username, password, database, etc are all the same as above
+    host_port: my-proxy-hostname:5439
+    options:
+      connect_args:
+        sslmode: "prefer" # or "require" or "verify-ca"
+        sslrootcert: ~ # needed to unpin the AWS Redshift certificate
+```
+
+ diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/source_docs/sagemaker.md new file mode 100644 index 0000000000000..295e4b7f1a21e --- /dev/null +++ b/metadata-ingestion/source_docs/sagemaker.md @@ -0,0 +1,34 @@ +# AWS SageMaker `sagemaker` + +Extracts: + +- Feature groups +- Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job) + +```yml +source: + type: sagemaker + config: + aws_region: # aws_region_name, i.e. "eu-west-1" + env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD". + + # Credentials. If not specified here, these are picked up according to boto3 rules. + # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) + aws_access_key_id: # Optional. + aws_secret_access_key: # Optional. + aws_session_token: # Optional. + aws_role: # Optional (Role chaining supported by using a sorted list). + + extract_feature_groups: True # if feature groups should be ingested, default True + extract_models: True # if models should be ingested, default True + extract_jobs: # if jobs should be ingested, default True for all + auto_ml: True + compilation: True + edge_packaging: True + hyper_parameter_tuning: True + labeling: True + processing: True + training: True + transform: True +``` + diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md new file mode 100644 index 0000000000000..c42d55e7b5f3a --- /dev/null +++ b/metadata-ingestion/source_docs/snowflake.md @@ -0,0 +1,68 @@ +# Snowflake `snowflake` + +Extracts: + +- List of databases, schema, and tables +- Column types associated with each table + +```yml +source: + type: snowflake + config: + username: user + password: pass + host_port: account_name + database_pattern: + # The escaping of the $ symbol helps us skip the environment variable substitution. + allow: + - ^MY_DEMO_DATA.* + - ^ANOTHER_DB_REGEX + deny: + - ^SNOWFLAKE\$ + - ^SNOWFLAKE_SAMPLE_DATA\$ + warehouse: "COMPUTE_WH" # optional + role: "sysadmin" # optional + include_views: True # whether to include views, defaults to True + # table_pattern/schema_pattern is same as above + # options is same as above +``` + +:::tip + +You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source. + +::: + + +# Snowflake Usage Stats `snowflake-usage` + +- Fetch a list of queries issued +- Fetch a list of tables and columns accessed (excludes views) +- Aggregate these statistics into buckets, by day or hour granularity + +Note: the user/role must have access to the account usage table. The "accountadmin" role has this by default, and other roles can be [granted this permission](https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles). + +Note: the underlying access history views that we use are only available in Snowflake's enterprise edition or higher. + +```yml +source: + type: snowflake-usage + config: + username: user + password: pass + host_port: account_name + role: ACCOUNTADMIN + env: PROD + + bucket_duration: "DAY" + start_time: ~ # defaults to the last full day in UTC (or hour) + end_time: ~ # defaults to the last full day in UTC (or hour) + + top_n_queries: 10 # number of queries to save for each table +``` + +:::note + +This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source. 
+ +::: diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md new file mode 100644 index 0000000000000..977f40b61f0fa --- /dev/null +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -0,0 +1,22 @@ +# Other databases using SQLAlchemy `sqlalchemy` + +The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen +database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) +defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself. + +Extracts: + +- List of schemas and tables +- Column types associated with each table + +```yml +source: + type: sqlalchemy + config: + # See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls + connect_uri: "dialect+driver://username:password@host:port/database" + options: {} # same as above + schema_pattern: {} # same as above + table_pattern: {} # same as above + include_views: True # whether to include views, defaults to True +``` diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md new file mode 100644 index 0000000000000..3c6dba608aefc --- /dev/null +++ b/metadata-ingestion/source_docs/superset.md @@ -0,0 +1,19 @@ +# Superset `superset` + +Extracts: + +- List of charts and dashboards + +```yml +source: + type: superset + config: + username: user + password: pass + provider: db | ldap + connect_uri: http://localhost:8088 + env: "PROD" # Optional, default is "PROD" +``` + +See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. + From 0916b7532fc5de9fb0d544fc176ea8acff26b4c1 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 12:41:49 -0700 Subject: [PATCH 02/33] Add links --- docs-website/sidebars.js | 8 ++ metadata-ingestion/README.md | 127 +++++++--------------- metadata-ingestion/sink_docs/console.md | 8 ++ metadata-ingestion/sink_docs/datahub.md | 32 ++++++ metadata-ingestion/sink_docs/file.md | 12 ++ metadata-ingestion/source_docs/datahub.md | 0 metadata-ingestion/source_docs/file.md | 2 +- 7 files changed, 100 insertions(+), 89 deletions(-) create mode 100644 metadata-ingestion/sink_docs/console.md create mode 100644 metadata-ingestion/sink_docs/datahub.md create mode 100644 metadata-ingestion/sink_docs/file.md delete mode 100644 metadata-ingestion/source_docs/datahub.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 42881b1f65460..e8900d80a6357 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -54,6 +54,14 @@ module.exports = { //"docs/what/gms", "datahub-web-react/README", ], + "Metadata Ingestion": [ + // { + // Sources: list_ids_in_directory("metadata-ingestion/source_docs"), + // }, + { + Sinks: list_ids_in_directory("metadata-ingestion/sink_docs"), + }, + ], "Metadata Modeling": [ "docs/modeling/metadata-model", "docs/modeling/extending-the-metadata-model", diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 7d72a70bd0fdd..5ffa0c67383c1 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -28,37 +28,45 @@ If you run into an error, try checking the [_common setup issues_](./developing. #### Installing Plugins -We use a plugin architecture so that you can install only the dependencies you actually need. 
- -| Plugin Name | Install Command | Provides | -| --------------- | ---------------------------------------------------------- | ----------------------------------- | -| file | _included by default_ | File source and sink | -| console | _included by default_ | Console sink | -| athena | `pip install 'acryl-datahub[athena]'` | AWS Athena source | -| bigquery | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | -| bigquery-usage | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | -| feast | `pip install 'acryl-datahub[feast]'` | Feast source | -| glue | `pip install 'acryl-datahub[glue]'` | AWS Glue source | -| hive | `pip install 'acryl-datahub[hive]'` | Hive source | -| mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source | -| mysql | `pip install 'acryl-datahub[mysql]'` | MySQL source | -| oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source | -| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source | -| redshift | `pip install 'acryl-datahub[redshift]'` | Redshift source | -| sagemaker | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | -| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | -| snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | -| snowflake-usage | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | -| superset | `pip install 'acryl-datahub[superset]'` | Superset source | -| mongodb | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | -| ldap | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | -| looker | `pip install 'acryl-datahub[looker]'` | Looker source | -| lookml | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | -| kafka | `pip install 'acryl-datahub[kafka]'` | Kafka source | -| druid | `pip install 'acryl-datahub[druid]'` | Druid Source | -| dbt | _no additional dependencies_ | dbt source | -| datahub-rest | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API | -| datahub-kafka | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka | +We use a plugin architecture so that you can install only the dependencies you actually need. Click the plugin name to learn more about the specific source recipe and any FAQs! 
+ +Sources: + +| Plugin Name | Install Command | Provides | +| --------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- | --- | +| [file](./source_docs/file.md) | _included by default_ | File source and sink | +| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | +| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | +| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | +| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source | +| [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | +| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source | +| [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | +| [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | +| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | | +| [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | +| [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | +| [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | +| [mongodb](./source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | +| [mssql](./source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source | +| [mysql](./source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source | +| [oracle](./source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source | +| [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | +| [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | +| [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | +| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | +| [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | +| [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | +| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | + +Sinks + +| Plugin Name | Install Command | Provides | +| --------------------------------------- | -------------------------------------------- | -------------------------- | +| [file](./sink_docs/file.md) | _included by default_ | File source and sink | +| [console](./sink_docs/console.md) | _included by default_ | Console sink | +| [datahub-rest](./sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API | +| [datahub-kafka](./sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka | These plugins can be mixed and matched as desired. For example: @@ -138,63 +146,6 @@ datahub ingest -c ./examples/recipes/mssql_to_datahub.yml A number of recipes are included in the examples/recipes directory. -## Sinks - -### DataHub Rest `datahub-rest` - -Pushes metadata to DataHub using the GMA rest API. 
The advantage of the rest-based interface -is that any errors can immediately be reported. - -```yml -sink: - type: "datahub-rest" - config: - server: "http://localhost:8080" -``` - -### DataHub Kafka `datahub-kafka` - -Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based -interface is that it's asynchronous and can handle higher throughput. This requires the -Datahub mce-consumer container to be running. - -```yml -sink: - type: "datahub-kafka" - config: - connection: - bootstrap: "localhost:9092" - producer_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.SerializingProducer - schema_registry_url: "http://localhost:8081" - schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient -``` - -The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. - -For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). - -### Console `console` - -Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes. - -```yml -sink: - type: "console" -``` - -### File `file` - -Outputs metadata to a file. This can be used to decouple metadata sourcing from the -process of pushing it into DataHub, and is particularly useful for debugging purposes. -Note that the file source can read files generated by this sink. - -```yml -sink: - type: file - config: - filename: ./path/to/mce/file.json -``` - ## Transformations Beyond basic ingestion, sometimes there might exist a need to modify the source data before passing it on to the sink. diff --git a/metadata-ingestion/sink_docs/console.md b/metadata-ingestion/sink_docs/console.md new file mode 100644 index 0000000000000..eb3c00a68574d --- /dev/null +++ b/metadata-ingestion/sink_docs/console.md @@ -0,0 +1,8 @@ +# Console `console` + +Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes. + +```yml +sink: + type: "console" +``` diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md new file mode 100644 index 0000000000000..d08850a1749e4 --- /dev/null +++ b/metadata-ingestion/sink_docs/datahub.md @@ -0,0 +1,32 @@ +# DataHub Rest `datahub-rest` + +Pushes metadata to DataHub using the GMA rest API. The advantage of the rest-based interface +is that any errors can immediately be reported. + +```yml +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" +``` + +# DataHub Kafka `datahub-kafka` + +Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based +interface is that it's asynchronous and can handle higher throughput. This requires the +Datahub mce-consumer container to be running. 
+ +```yml +sink: + type: "datahub-kafka" + config: + connection: + bootstrap: "localhost:9092" + producer_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.SerializingProducer + schema_registry_url: "http://localhost:8081" + schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient +``` + +The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. + +For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md new file mode 100644 index 0000000000000..cc8282cd609c5 --- /dev/null +++ b/metadata-ingestion/sink_docs/file.md @@ -0,0 +1,12 @@ +# File `file` + +Outputs metadata to a file. This can be used to decouple metadata sourcing from the +process of pushing it into DataHub, and is particularly useful for debugging purposes. +Note that the file source can read files generated by this sink. + +```yml +sink: + type: file + config: + filename: ./path/to/mce/file.json +``` diff --git a/metadata-ingestion/source_docs/datahub.md b/metadata-ingestion/source_docs/datahub.md deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 8b0389a753dd7..a86cf87baf713 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -9,4 +9,4 @@ source: type: file config: filename: ./path/to/mce/file.json -``` \ No newline at end of file +``` From 2bb1d79d6dc56f89872d73c09af2a8562c43950d Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 12:42:35 -0700 Subject: [PATCH 03/33] Fix link --- metadata-ingestion/sink_docs/datahub.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index d08850a1749e4..bc59af5e7092d 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -29,4 +29,4 @@ sink: The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. -For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). +For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). From 487a2b6e7da3e57ddbdc08de8e74a708f0dfeeb1 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 12:58:07 -0700 Subject: [PATCH 04/33] Fix glue link --- docs-website/generateDocsDir.ts | 5 +++++ docs-website/sidebars.js | 6 +++--- metadata-ingestion/source_docs/file.md | 4 ++-- metadata-ingestion/source_docs/glue.md | 3 +-- metadata-ingestion/source_docs/kafka.md | 3 +-- metadata-ingestion/source_docs/mysql.md | 2 +- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 1b142f6b625cc..180d3c97c5548 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -158,6 +158,11 @@ function markdown_guess_title( } else { // Find first h1 header and use it as the title. 
const headers = contents.content.match(/^# (.+)$/gm); + + if (!headers) { + throw new Error(`${filepath} must have at least one h1 header`); + } + if (headers.length > 1 && contents.content.indexOf("```") < 0) { throw new Error(`too many h1 headers in ${filepath}`); } diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index e8900d80a6357..714b6199602cd 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -55,9 +55,9 @@ module.exports = { "datahub-web-react/README", ], "Metadata Ingestion": [ - // { - // Sources: list_ids_in_directory("metadata-ingestion/source_docs"), - // }, + { + Sources: list_ids_in_directory("metadata-ingestion/source_docs"), + }, { Sinks: list_ids_in_directory("metadata-ingestion/sink_docs"), }, diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index a86cf87baf713..905525c871df2 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,8 +1,8 @@ -### File `file` +# File `file` Pulls metadata from a previously generated file. Note that the file sink can produce such files, and a number of samples are included in the -[examples/mce_files](examples/mce_files) directory. +[examples/mce_files](../examples/mce_files) directory. ```yml source: diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index ab723ea45af1c..c86bc62d476ea 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -1,7 +1,6 @@ - # AWS Glue `glue` -Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](./s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. +Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. Extracts: diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index 20b49c417f124..c447556d6834e 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -18,8 +18,7 @@ source: The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. -For a full example with a number of security options, see this [example recipe](./examples/recipes/secured_kafka.yml). - +For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). 
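To make that concrete, here is a minimal sketch of a secured recipe, assuming a SASL_SSL-protected broker and a schema registry behind basic auth; the endpoints and credential placeholders are illustrative, not defaults:

```yml
source:
  type: "kafka"
  config:
    connection:
      bootstrap: "broker:9093"
      consumer_config:
        # librdkafka settings are passed straight through to the consumer
        security.protocol: "SASL_SSL"
        sasl.mechanism: "PLAIN"
        sasl.username: "${CLUSTER_API_KEY}"
        sasl.password: "${CLUSTER_API_SECRET}"
      schema_registry_url: "https://schema-registry:8081"
      schema_registry_config:
        # passed straight through to the SchemaRegistryClient
        basic.auth.user.info: "${REGISTRY_API_KEY}:${REGISTRY_API_SECRET}"
```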
# Kafka Connect `kafka-connect` diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index ecf564395d41c..a68a9dcebb09c 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -1,4 +1,4 @@ -### MySQL `mysql` +# MySQL `mysql` Extracts: From a24dc59176cd8fcc4ec4e54e9819b540edf6c5c9 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 13:16:47 -0700 Subject: [PATCH 05/33] Add module installs to each page --- metadata-ingestion/README.md | 4 ++-- metadata-ingestion/sink_docs/console.md | 2 +- metadata-ingestion/sink_docs/datahub.md | 8 ++++++-- metadata-ingestion/sink_docs/file.md | 2 +- metadata-ingestion/source_docs/athena.md | 4 +++- metadata-ingestion/source_docs/bigquery.md | 7 +++++-- metadata-ingestion/source_docs/dbt.md | 3 +-- metadata-ingestion/source_docs/druid.md | 4 +++- metadata-ingestion/source_docs/feast.md | 4 +++- metadata-ingestion/source_docs/file.md | 2 +- metadata-ingestion/source_docs/glue.md | 4 +++- metadata-ingestion/source_docs/hive.md | 5 +++-- metadata-ingestion/source_docs/kafka.md | 4 +++- metadata-ingestion/source_docs/ldap.md | 4 +++- metadata-ingestion/source_docs/looker.md | 6 ++++-- metadata-ingestion/source_docs/lookml.md | 4 +++- metadata-ingestion/source_docs/mongodb.md | 4 +++- metadata-ingestion/source_docs/mssql.md | 5 +++-- metadata-ingestion/source_docs/mysql.md | 4 +++- metadata-ingestion/source_docs/oracle.md | 4 +++- metadata-ingestion/source_docs/postgres.md | 5 +++-- metadata-ingestion/source_docs/redshift.md | 5 +++-- metadata-ingestion/source_docs/sagemaker.md | 5 +++-- metadata-ingestion/source_docs/snowflake.md | 7 +++++-- metadata-ingestion/source_docs/sqlalchemy.md | 4 +++- metadata-ingestion/source_docs/superset.md | 5 +++-- 26 files changed, 77 insertions(+), 38 deletions(-) diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 5ffa0c67383c1..9365ff1954f80 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -33,7 +33,7 @@ We use a plugin architecture so that you can install only the dependencies you a Sources: | Plugin Name | Install Command | Provides | -| --------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- | --- | +| --------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- | | [file](./source_docs/file.md) | _included by default_ | File source and sink | | [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | | [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | @@ -43,7 +43,7 @@ Sources: | [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source | | [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | | [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | -| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | | +| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | | [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | | [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | | [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, 
requires Python 3.7+ |
diff --git a/metadata-ingestion/sink_docs/console.md b/metadata-ingestion/sink_docs/console.md
index eb3c00a68574d..d8bbf7c44c9bd 100644
--- a/metadata-ingestion/sink_docs/console.md
+++ b/metadata-ingestion/sink_docs/console.md
@@ -1,4 +1,4 @@
-# Console `console`
+# Console
 
 Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes.
 
diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md
index bc59af5e7092d..f77062d7866ae 100644
--- a/metadata-ingestion/sink_docs/datahub.md
+++ b/metadata-ingestion/sink_docs/datahub.md
@@ -1,4 +1,6 @@
-# DataHub Rest `datahub-rest`
+# DataHub Rest
+
+To install this plugin, run `pip install 'acryl-datahub[datahub-rest]'`.
 
 Pushes metadata to DataHub using the GMA rest API. The advantage of the rest-based interface
 is that any errors can immediately be reported.
@@ -10,7 +12,9 @@ sink:
     server: "http://localhost:8080"
 ```
 
-# DataHub Kafka `datahub-kafka`
+# DataHub Kafka
+
+To install this plugin, run `pip install 'acryl-datahub[datahub-kafka]'`.
 
 Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based
 interface is that it's asynchronous and can handle higher throughput. This requires the
diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md
index cc8282cd609c5..c7cbcc47d43b2 100644
--- a/metadata-ingestion/sink_docs/file.md
+++ b/metadata-ingestion/sink_docs/file.md
@@ -1,4 +1,4 @@
-# File `file`
+# File
 
 Outputs metadata to a file. This can be used to decouple metadata sourcing from the
 process of pushing it into DataHub, and is particularly useful for debugging purposes.
diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md
index 726c2521fa49e..33e3ad4e08739 100644
--- a/metadata-ingestion/source_docs/athena.md
+++ b/metadata-ingestion/source_docs/athena.md
@@ -1,4 +1,6 @@
-# AWS Athena `athena`
+# AWS Athena
+
+To install this plugin, run `pip install 'acryl-datahub[athena]'`.
 
 Extracts:
 
diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md
index 5d3eb9f636109..16b4c67560c03 100644
--- a/metadata-ingestion/source_docs/bigquery.md
+++ b/metadata-ingestion/source_docs/bigquery.md
@@ -1,4 +1,6 @@
-# Google BigQuery `bigquery`
+# Google BigQuery
+
+To install this plugin, run `pip install 'acryl-datahub[bigquery]'`.
 
 Extracts:
 
@@ -23,8 +25,9 @@ You can also get fine-grained usage statistics for BigQuery using the `bigquery-
 
 :::
 
+# Google BigQuery Usage Stats
 
-# Google BigQuery Usage Stats `bigquery-usage`
+To install this plugin, run `pip install 'acryl-datahub[bigquery-usage]'`.
 
 - Fetch a list of queries issued
 - Fetch a list of tables and columns accessed
diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md
index 7e52e30a7daa6..4699ef8ca13e8 100644
--- a/metadata-ingestion/source_docs/dbt.md
+++ b/metadata-ingestion/source_docs/dbt.md
@@ -1,4 +1,4 @@
-# dbt `dbt`
+# dbt
 
 Pull metadata from dbt artifacts files:
 
@@ -36,4 +36,3 @@ source:
 ```
 
 Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage.
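For readers less familiar with dbt, the snippet below is a hypothetical `schema.yml` fragment showing what such an identifier looks like; the source and table names are invented purely for illustration:

```yml
# Hypothetical dbt schema.yml fragment (names invented for illustration)
version: 2

sources:
  - name: warehouse
    tables:
      - name: customers # referenced as {{ source('warehouse', 'customers') }} in models
        identifier: raw_customers_v2 # the physical table name, i.e. the "identifier"
```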
- diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index bd7dae7f2ac73..62efb0d2a6c74 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -1,4 +1,6 @@ -# Druid `druid` +# Druid + +To install this plugin, run `pip install 'acryl-datahub[druid]'`. Extracts: diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md index 24a2c1c72d788..8cd5bea80d31c 100644 --- a/metadata-ingestion/source_docs/feast.md +++ b/metadata-ingestion/source_docs/feast.md @@ -1,7 +1,9 @@ -# Feast `feast` +# Feast **Note: Feast ingestion requires Docker to be installed.** +To install this plugin, run `pip install 'acryl-datahub[feast]'`. + Extracts: - List of feature tables (modeled as [`MLFeatureTable`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 905525c871df2..efb5315175344 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,4 +1,4 @@ -# File `file` +# File Pulls metadata from a previously generated file. Note that the file sink can produce such files, and a number of samples are included in the diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index c86bc62d476ea..0d17afab678b5 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -1,7 +1,9 @@ -# AWS Glue `glue` +# AWS Glue Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. +To install this plugin, run `pip install 'acryl-datahub[glue]'`. + Extracts: - List of tables diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index 3eb36bf620bae..ec9be6daa0db8 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -1,4 +1,6 @@ -# Hive `hive` +# Hive + +To install this plugin, run `pip install 'acryl-datahub[hive]'`. Extracts: @@ -44,4 +46,3 @@ source: ``` - diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index c447556d6834e..6191d4952d87c 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -1,4 +1,6 @@ -# Kafka Metadata `kafka` +# Kafka Metadata + +To install this plugin, run `pip install 'acryl-datahub[kafka]'`. Extracts: diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md index 243075da4a83a..a2682189f181e 100644 --- a/metadata-ingestion/source_docs/ldap.md +++ b/metadata-ingestion/source_docs/ldap.md @@ -1,4 +1,6 @@ -# LDAP `ldap` +# LDAP + +To install this plugin, run `pip install 'acryl-datahub[ldap]'`. Extracts: diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index a195a6ef3b2da..c19c8f125fa22 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -1,4 +1,6 @@ -# Looker dashboards `looker` +# Looker dashboards + +To install this plugin, run `pip install 'acryl-datahub[looker]'`. 
Extracts: @@ -19,4 +21,4 @@ source: actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl env: "PROD" # Optional, default is "PROD" platform_name: "looker" # Optional, default is "looker" -``` \ No newline at end of file +``` diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index 3843ba39bee6d..d50384f5f75cc 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -1,4 +1,6 @@ -# LookML `lookml` +# LookML + +To install this plugin, run `pip install 'acryl-datahub[lookml]'`. Note! This plugin uses a package that requires Python 3.7+! diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index a951c992a2d61..eb05300ef02c1 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -1,4 +1,6 @@ -# MongoDB `mongodb` +# MongoDB + +To install this plugin, run `pip install 'acryl-datahub[mongodb]'`. Extracts: diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index 8317ef7a7deb4..99624542dba7b 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -1,4 +1,6 @@ -# Microsoft SQL Server Metadata `mssql` +# Microsoft SQL Server Metadata + +To install this plugin, run `pip install 'acryl-datahub[mssql]'`. We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. @@ -63,4 +65,3 @@ source: ``` - diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index a68a9dcebb09c..03780d106a843 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -1,4 +1,6 @@ -# MySQL `mysql` +# MySQL + +To install this plugin, run `pip install 'acryl-datahub[mysql]'`. Extracts: diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index 264c5e18804c6..d74aa9eb8aa19 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -1,4 +1,6 @@ -# Oracle `oracle` +# Oracle + +To install this plugin, run `pip install 'acryl-datahub[oracle]'`. Extracts: diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 92ffef44a5718..9991f690d6dbe 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -1,4 +1,6 @@ -# PostgreSQL `postgres` +# PostgreSQL + +To install this plugin, run `pip install 'acryl-datahub[postgres]'`. Extracts: @@ -20,4 +22,3 @@ source: # table_pattern/schema_pattern is same as above # options is same as above ``` - diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 22595a055df5a..311647383cf11 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -1,4 +1,6 @@ -# Redshift `redshift` +# Redshift + +To install this plugin, run `pip install 'acryl-datahub[redshift]'`. 
Extracts: @@ -38,4 +40,3 @@ source: ``` - diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/source_docs/sagemaker.md index 295e4b7f1a21e..587df3f02970b 100644 --- a/metadata-ingestion/source_docs/sagemaker.md +++ b/metadata-ingestion/source_docs/sagemaker.md @@ -1,4 +1,6 @@ -# AWS SageMaker `sagemaker` +# AWS SageMaker + +To install this plugin, run `pip install 'acryl-datahub[sagemaker]'`. Extracts: @@ -31,4 +33,3 @@ source: training: True transform: True ``` - diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index c42d55e7b5f3a..73b5563728705 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -1,4 +1,6 @@ -# Snowflake `snowflake` +# Snowflake + +To install this plugin, run `pip install 'acryl-datahub[snowflake]'`. Extracts: @@ -33,9 +35,10 @@ You can also get fine-grained usage statistics for Snowflake using the `snowflak ::: - # Snowflake Usage Stats `snowflake-usage` +To install this plugin, run `pip install 'acryl-datahub[snowflake-usage]'`. + - Fetch a list of queries issued - Fetch a list of tables and columns accessed (excludes views) - Aggregate these statistics into buckets, by day or hour granularity diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index 977f40b61f0fa..57959aaa31411 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -1,4 +1,6 @@ -# Other databases using SQLAlchemy `sqlalchemy` +# Other databases using SQLAlchemy + +To install this plugin, run `pip install 'acryl-datahub[sqlalchemy]'`. The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md index 3c6dba608aefc..55749c91686e4 100644 --- a/metadata-ingestion/source_docs/superset.md +++ b/metadata-ingestion/source_docs/superset.md @@ -1,4 +1,6 @@ -# Superset `superset` +# Superset + +To install this plugin, run `pip install 'acryl-datahub[superset]'`. Extracts: @@ -16,4 +18,3 @@ source: ``` See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. 
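A complete recipe simply pairs this source with one of the sinks under `sink_docs`. For example, a minimal sketch assuming a DataHub GMS reachable at localhost:8080:

```yml
source:
  type: superset
  config:
    username: user
    password: pass
    provider: db
    connect_uri: http://localhost:8088

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```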
- From 5c6a19a52cf6c9ab29985aff354336dd83e2c7ce Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 15:07:26 -0700 Subject: [PATCH 06/33] Consistency --- metadata-ingestion/README.md | 4 ++-- metadata-ingestion/source_docs/athena.md | 2 +- metadata-ingestion/source_docs/bigquery.md | 4 ++-- metadata-ingestion/source_docs/dbt.md | 2 +- metadata-ingestion/source_docs/druid.md | 2 +- metadata-ingestion/source_docs/feast.md | 2 +- metadata-ingestion/source_docs/file.md | 2 +- metadata-ingestion/source_docs/glue.md | 6 +++--- metadata-ingestion/source_docs/hive.md | 2 +- metadata-ingestion/source_docs/kafka.md | 4 ++-- metadata-ingestion/source_docs/ldap.md | 2 +- metadata-ingestion/source_docs/looker.md | 2 +- metadata-ingestion/source_docs/lookml.md | 2 +- metadata-ingestion/source_docs/mongodb.md | 2 +- metadata-ingestion/source_docs/mssql.md | 2 +- metadata-ingestion/source_docs/mysql.md | 2 +- metadata-ingestion/source_docs/oracle.md | 2 +- metadata-ingestion/source_docs/postgres.md | 2 +- metadata-ingestion/source_docs/redshift.md | 2 +- metadata-ingestion/source_docs/sagemaker.md | 2 +- metadata-ingestion/source_docs/snowflake.md | 2 +- metadata-ingestion/source_docs/sqlalchemy.md | 2 +- metadata-ingestion/source_docs/superset.md | 2 +- 23 files changed, 28 insertions(+), 28 deletions(-) diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 9365ff1954f80..41826a668a220 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -54,9 +54,9 @@ Sources: | [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | | [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | | [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | -| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | | [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | | [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | +| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | | [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | Sinks @@ -144,7 +144,7 @@ Running a recipe is quite easy. datahub ingest -c ./examples/recipes/mssql_to_datahub.yml ``` -A number of recipes are included in the examples/recipes directory. +A number of recipes are included in the [examples/recipes](./examples/recipes) directory. See also pages described in the [table of plugins](#installing-plugins) for more context on recipe options for each source and sink. ## Transformations diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 33e3ad4e08739..5e0dcda9c4b23 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[athena]'`. 
-Extracts: +This plugin extracts the following: - List of databases and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 16b4c67560c03..f705bbe5ddd3e 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[bigquery]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table @@ -21,7 +21,7 @@ source: :::tip -You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source. +You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source described below. ::: diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md index 4699ef8ca13e8..8a4f72794e7f8 100644 --- a/metadata-ingestion/source_docs/dbt.md +++ b/metadata-ingestion/source_docs/dbt.md @@ -1,6 +1,6 @@ # dbt -Pull metadata from dbt artifacts files: +This plugin pulls metadata from dbt's artifact files: - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) - This file contains model, source and lineage data. diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index 62efb0d2a6c74..e719b3c1ea5f5 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[druid]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md index 8cd5bea80d31c..46b16b41be223 100644 --- a/metadata-ingestion/source_docs/feast.md +++ b/metadata-ingestion/source_docs/feast.md @@ -4,7 +4,7 @@ To install this plugin, run `pip install 'acryl-datahub[feast]'`. -Extracts: +This plugin extracts the following: - List of feature tables (modeled as [`MLFeatureTable`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), features ([`MLFeature`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl)s), diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index efb5315175344..0b1ba7c504dad 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,6 +1,6 @@ # File -Pulls metadata from a previously generated file. Note that the file sink +This plugin pulls metadata from a previously generated file. The file sink can produce such files, and a number of samples are included in the [examples/mce_files](../examples/mce_files) directory. diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index 0d17afab678b5..be65d0e5c0567 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -1,10 +1,10 @@ # AWS Glue -Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. - To install this plugin, run `pip install 'acryl-datahub[glue]'`. 
-Extracts: +Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. + +This plugin extracts the following: - List of tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index ec9be6daa0db8..c3125e8ec9299 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[hive]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index 6191d4952d87c..d87f0f4236b17 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[kafka]'`. -Extracts: +This plugin extracts the following: - List of topics - from the Kafka broker - Schemas associated with each topic - from the schema registry @@ -24,7 +24,7 @@ For a full example with a number of security options, see this [example recipe]( # Kafka Connect `kafka-connect` -Extracts: +This plugin extracts the following: - Kafka Connect connector as individual `DataFlowSnapshotClass` entity - Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md index a2682189f181e..b1df1f385a99f 100644 --- a/metadata-ingestion/source_docs/ldap.md +++ b/metadata-ingestion/source_docs/ldap.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[ldap]'`. -Extracts: +This plugin extracts the following: - List of people - Names, emails, titles, and manager information for each person diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index c19c8f125fa22..c395781b7a2d9 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[looker]'`. -Extracts: +This plugin extracts the following: - Looker dashboards and dashboard elements (charts) - Names, descriptions, URLs, chart types, input view for the charts diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index d50384f5f75cc..407656c583850 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -4,7 +4,7 @@ To install this plugin, run `pip install 'acryl-datahub[lookml]'`. Note! This plugin uses a package that requires Python 3.7+! -Extracts: +This plugin extracts the following: - LookML views from model files - Name, upstream table names, dimensions, measures, and dimension groups diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index eb05300ef02c1..9cb1140e7bced 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[mongodb]'`. 
-Extracts: +This plugin extracts the following: - List of databases - List of collections in each database and infers schemas for each collection diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index 99624542dba7b..85c6be77c4939 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -4,7 +4,7 @@ To install this plugin, run `pip install 'acryl-datahub[mssql]'`. We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. -Extracts: +This plugin extracts the following: - List of databases, schema, tables and views - Column types associated with each table/view diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index 03780d106a843..ae4f1b1823614 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[mysql]'`. -Extracts: +This plugin extracts the following: - List of databases and tables - Column types and schema associated with each table diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index d74aa9eb8aa19..b516cc2dac716 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[oracle]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 9991f690d6dbe..5f41cd0fd95cc 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[postgres]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 311647383cf11..c8ad1aa4259f7 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[redshift]'`. -Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/source_docs/sagemaker.md index 587df3f02970b..f6ea7009b2448 100644 --- a/metadata-ingestion/source_docs/sagemaker.md +++ b/metadata-ingestion/source_docs/sagemaker.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[sagemaker]'`. -Extracts: +This plugin extracts the following: - Feature groups - Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job) diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 73b5563728705..2c90ffb9c9ff9 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[snowflake]'`. 
-Extracts: +This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index 57959aaa31411..272db30599da5 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -6,7 +6,7 @@ The `sqlalchemy` source is useful if we don't have a pre-built source for your c database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself. -Extracts: +This plugin extracts the following: - List of schemas and tables - Column types associated with each table diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md index 55749c91686e4..5b83566edc960 100644 --- a/metadata-ingestion/source_docs/superset.md +++ b/metadata-ingestion/source_docs/superset.md @@ -2,7 +2,7 @@ To install this plugin, run `pip install 'acryl-datahub[superset]'`. -Extracts: +This plugin extracts the following: - List of charts and dashboards From 2382c3037b6c01c663681a7008f020c7d82d3cec Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 15:36:06 -0700 Subject: [PATCH 07/33] Standardize sqlalchemy pattern --- metadata-ingestion/source_docs/athena.md | 3 ++ metadata-ingestion/source_docs/bigquery.md | 32 ++++++++++++++- metadata-ingestion/source_docs/druid.md | 34 +++++++++++++++- metadata-ingestion/source_docs/glue.md | 11 ++--- metadata-ingestion/source_docs/hive.md | 36 ++++++++++++++++- metadata-ingestion/source_docs/mssql.md | 42 +++++++++++++++----- metadata-ingestion/source_docs/mysql.md | 27 +++++++++++-- metadata-ingestion/source_docs/oracle.md | 35 +++++++++++++++- metadata-ingestion/source_docs/postgres.md | 35 +++++++++++++++- metadata-ingestion/source_docs/redshift.md | 35 +++++++++++++++- metadata-ingestion/source_docs/snowflake.md | 35 +++++++++++++++- metadata-ingestion/source_docs/sqlalchemy.md | 36 +++++++++++++++-- 12 files changed, 326 insertions(+), 35 deletions(-) diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 5e0dcda9c4b23..7792511729487 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -15,11 +15,14 @@ source: # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html password: aws_secret_access_key # Optional. database: database # Optional, defaults to "default" + aws_region: aws_region_name # i.e. "eu-west-1" + s3_staging_dir: s3_location # "s3:///prefix/" # The s3_staging_dir parameter is needed because Athena always writes query results to S3. # See https://docs.aws.amazon.com/athena/latest/ug/querying.html # However, the athena driver will transparently fetch these results as you would expect from any other sql client. + work_group: athena_workgroup # "primary" # table_pattern/schema_pattern is same as above ``` diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index f705bbe5ddd3e..d5c1d15b95aa6 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -12,11 +12,41 @@ source: type: bigquery config: project_id: project # optional - can autodetect from environment + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. 
+ # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. options: # options is same as above # See https://github.com/mxmzdlv/pybigquery#authentication for details. credentials_path: "/path/to/keyfile.json" # optional + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above ``` :::tip diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index e719b3c1ea5f5..a2e17e429dd4e 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -17,8 +17,38 @@ source: config: # Point to broker address host_port: localhost:8082 + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. schema_pattern: deny: - - "^(lookup|sys).*" - # options is same as above + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + + include_views: True # whether to include views, defaults to True ``` diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index be65d0e5c0567..661bab8f6a759 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -16,17 +16,18 @@ source: type: glue config: aws_region: # aws_region_name, i.e. 
"eu-west-1" - extract_transforms: True # whether to ingest Glue jobs, defaults to True env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD". - # Filtering patterns for databases and tables to scan - database_pattern: # Optional, to filter databases scanned, same as schema_pattern above. - table_pattern: # Optional, to filter tables scanned, same as table_pattern above. - # Credentials. If not specified here, these are picked up according to boto3 rules. # (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) aws_access_key_id: # Optional. aws_secret_access_key: # Optional. aws_session_token: # Optional. aws_role: # Optional (Role chaining supported by using a sorted list). + + extract_transforms: True # whether to ingest Glue jobs, defaults to True + + # Filtering patterns for databases and tables to scan + database_pattern: # Optional, to filter databases scanned, same as schema_pattern above. + table_pattern: # Optional, to filter tables scanned, same as table_pattern above. ``` diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index c3125e8ec9299..a0448728adebc 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -22,8 +22,40 @@ source: password: pass # optional host_port: localhost:10000 database: DemoDatabase # optional, defaults to 'default' - # table_pattern/schema_pattern is same as above - # options is same as above + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + + include_views: True # whether to include views, defaults to True ```
diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index 85c6be77c4939..2104cdd7445f7 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -17,20 +17,42 @@ source: password: pass host_port: localhost:1433 database: DemoDatabase - include_views: True # whether to include views, defaults to True + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + charset: "utf8" + + # Tables to allow/deny table_pattern: deny: + # Note that the deny patterns take precedence over the allow patterns. - "^.*\\.sys_.*" # deny all tables that start with sys_ + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" allow: - - "schema1.table1" - - "schema1.table2" - options: - # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. - # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. - # Many of these options are specific to the underlying database driver, so that library's - # documentation will be a good reference for what is supported. To find which dialect is likely - # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. - charset: "utf8" + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + + include_views: True # whether to include views, defaults to True + # If set to true, we'll use the pyodbc library. This requires you to have # already installed the Microsoft ODBC Driver for SQL Server. # See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index ae4f1b1823614..a6d159760f0ac 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -15,19 +15,38 @@ source: password: example database: dbname host_port: localhost:3306 + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. 
- - "performance_schema" + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" allow: - - "schema1.table2" + - "good_table" + - "excellent_table" + # Although the 'table_pattern' enables you to skip everything from certain schemas, # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. schema_pattern: deny: - - "garbage_schema" + - "bad_schema" + - "junk_table" allow: - - "schema1" + - "good_schema" + - "excellent_schema" + + include_views: True # whether to include views, defaults to True ``` diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index b516cc2dac716..fc0bd1e8f63ef 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -21,7 +21,38 @@ source: host_port: localhost:5432 database: dbname service_name: svc # omit database if using this option + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above ``` diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 5f41cd0fd95cc..b8042c69d2e53 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -18,7 +18,38 @@ source: host_port: localhost:5432 database: DemoDatabase database_alias: DatabaseNameToBeIngested + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. 
+ - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above ``` diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index c8ad1aa4259f7..bf74400a0103c 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -16,9 +16,40 @@ source: password: pass host_port: example.something.us-west-2.redshift.amazonaws.com:5439 database: DemoDatabase + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above ```
diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 2c90ffb9c9ff9..57ede6a5d248a 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -24,9 +24,40 @@ source: - ^SNOWFLAKE_SAMPLE_DATA\$ warehouse: "COMPUTE_WH" # optional role: "sysadmin" # optional + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True - # table_pattern/schema_pattern is same as above - # options is same as above ``` :::tip diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index 272db30599da5..13e4c7e6b02f0 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -17,8 +17,38 @@ source: config: # See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls connect_uri: "dialect+driver://username:password@host:port/database" - options: {} # same as above - schema_pattern: {} # same as above - table_pattern: {} # same as above + + # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. + # See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. + # Many of these options are specific to the underlying database driver, so that library's + # documentation will be a good reference for what is supported. To find which dialect is likely + # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. + options: + # driver_option: some-option + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. 
+ schema_pattern: + deny: + - "bad_schema" + - "junk_table" + allow: + - "good_schema" + - "excellent_schema" + include_views: True # whether to include views, defaults to True ``` From 34fbccf5ff7f8daa14b416292824cd07ab2691a7 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 15:39:36 -0700 Subject: [PATCH 08/33] Add missing sql options --- metadata-ingestion/source_docs/bigquery.md | 22 +++++++++++++------- metadata-ingestion/source_docs/druid.md | 22 +++++++++++++------- metadata-ingestion/source_docs/hive.md | 22 +++++++++++++------- metadata-ingestion/source_docs/mssql.md | 22 +++++++++++++------- metadata-ingestion/source_docs/mysql.md | 22 +++++++++++++------- metadata-ingestion/source_docs/oracle.md | 22 +++++++++++++------- metadata-ingestion/source_docs/postgres.md | 22 +++++++++++++------- metadata-ingestion/source_docs/redshift.md | 22 +++++++++++++------- metadata-ingestion/source_docs/snowflake.md | 22 +++++++++++++------- metadata-ingestion/source_docs/sqlalchemy.md | 22 +++++++++++++------- 10 files changed, 140 insertions(+), 80 deletions(-) diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index d5c1d15b95aa6..93f5b4949518e 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -34,19 +34,25 @@ source: - "good_table" - "excellent_table" - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. schema_pattern: deny: - - "bad_schema" - - "junk_table" + # ... + allow: + # ... + + # Same format as table_pattern, used for filtering views + view_pattern: + deny: + # ... allow: - - "good_schema" - - "excellent_schema" + # ... include_views: True # whether to include views, defaults to True + include_tables: True # whether to include views, defaults to True ``` :::tip diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index a2e17e429dd4e..df43204e2eb40 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -38,17 +38,23 @@ source: - "good_table" - "excellent_table" - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. 
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md
index a0448728adebc..9b96457f84eb1 100644
--- a/metadata-ingestion/source_docs/hive.md
+++ b/metadata-ingestion/source_docs/hive.md
@@ -43,19 +43,25 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
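The recipes in these docs keep describing `table_pattern` / `schema_pattern` blocks of allow/deny regexes, with the note that deny patterns take precedence over allow patterns. As a rough illustration of that behaviour only, here is a minimal Python sketch; the helper name `is_allowed`, the use of `re.match`, and the "empty allow list allows everything" rule are assumptions made for this example, not DataHub's actual filtering code.

```python
import re
from typing import List


def is_allowed(name: str, allow: List[str], deny: List[str]) -> bool:
    """Illustrative allow/deny check: deny patterns take precedence over allow patterns."""
    # If any deny regex matches, the table/schema is skipped outright.
    if any(re.match(pattern, name) for pattern in deny):
        return False
    # Otherwise the name must match at least one allow regex;
    # an empty allow list is treated here as "allow everything".
    return not allow or any(re.match(pattern, name) for pattern in allow)


# Values mirror the table_pattern blocks in the recipes above.
deny_patterns = ["bad_table", "junk_table", "(old|used|deprecated)_table"]
allow_patterns = ["good_table", "excellent_table"]

print(is_allowed("good_table", allow_patterns, deny_patterns))        # True
print(is_allowed("deprecated_table", allow_patterns, deny_patterns))  # False: deny wins
```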
diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md
index 2104cdd7445f7..12133cf439398 100644
--- a/metadata-ingestion/source_docs/mssql.md
+++ b/metadata-ingestion/source_docs/mssql.md
@@ -39,19 +39,25 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 
   # If set to true, we'll use the pyodbc library. This requires you to have
   # already installed the Microsoft ODBC Driver for SQL Server.
   # See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15
diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md
index a6d159760f0ac..2ede14967c1d0 100644
--- a/metadata-ingestion/source_docs/mysql.md
+++ b/metadata-ingestion/source_docs/mysql.md
@@ -36,17 +36,23 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md
index fc0bd1e8f63ef..6550d74c4ea7f 100644
--- a/metadata-ingestion/source_docs/oracle.md
+++ b/metadata-ingestion/source_docs/oracle.md
@@ -42,17 +42,23 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md
index b8042c69d2e53..94bcacaa49775 100644
--- a/metadata-ingestion/source_docs/postgres.md
+++ b/metadata-ingestion/source_docs/postgres.md
@@ -39,17 +39,23 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md
index bf74400a0103c..609dfd8d87715 100644
--- a/metadata-ingestion/source_docs/redshift.md
+++ b/metadata-ingestion/source_docs/redshift.md
@@ -37,19 +37,25 @@ source:
       - "good_table"
       - "excellent_table"
 
-  # Although the 'table_pattern' enables you to skip everything from certain schemas,
-  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
-  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
-  # them out afterwards via the table_pattern.
+  # Although the 'table_pattern' enables you to skip everything from certain schemas,
+  # having another option to allow/deny on schema level is an optimization for the case when there is a large number
+  # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
+  # them out afterwards via the table_pattern.
   schema_pattern:
     deny:
-      - "bad_schema"
-      - "junk_table"
+      # ...
+    allow:
+      # ...
+
+  # Same format as table_pattern, used for filtering views
+  view_pattern:
+    deny:
+      # ...
     allow:
-      - "good_schema"
-      - "excellent_schema"
+      # ...
 
   include_views: True # whether to include views, defaults to True
+  include_tables: True # whether to include tables, defaults to True
 ```
diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 57ede6a5d248a..a1badd26f0aa6 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -45,19 +45,25 @@ source: - "good_table" - "excellent_table" - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. schema_pattern: deny: - - "bad_schema" - - "junk_table" + # ... + allow: + # ... + + # Same format as table_pattern, used for filtering views + view_pattern: + deny: + # ... allow: - - "good_schema" - - "excellent_schema" + # ... include_views: True # whether to include views, defaults to True + include_tables: True # whether to include views, defaults to True ``` :::tip diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index 13e4c7e6b02f0..ad20ed77bfc4c 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -38,17 +38,23 @@ source: - "good_table" - "excellent_table" - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. schema_pattern: deny: - - "bad_schema" - - "junk_table" + # ... + allow: + # ... + + # Same format as table_pattern, used for filtering views + view_pattern: + deny: + # ... allow: - - "good_schema" - - "excellent_schema" + # ... 
include_views: True # whether to include views, defaults to True + include_tables: True # whether to include views, defaults to True ``` From 9808735a4d31827edc1d606d0c9403b0f2ab153b Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 15:58:54 -0700 Subject: [PATCH 09/33] More consistent recipes --- metadata-ingestion/source_docs/bigquery.md | 4 +++- metadata-ingestion/source_docs/dbt.md | 10 ++++++++++ metadata-ingestion/source_docs/glue.md | 17 +++++++++++++++-- metadata-ingestion/source_docs/hive.md | 1 - metadata-ingestion/source_docs/kafka.md | 19 +++++++++++++++++-- 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 93f5b4949518e..2aec46deade94 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -86,13 +86,15 @@ source: options: # See https://googleapis.dev/python/logging/latest/client.html for details. credentials: ~ # optional - see docs - env: PROD + # Common usage stats options bucket_duration: "DAY" start_time: ~ # defaults to the last full day in UTC (or hour) end_time: ~ # defaults to the last full day in UTC (or hour) top_n_queries: 10 # number of queries to save for each table + + env: PROD ``` :::note diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md index 8a4f72794e7f8..aee7c4736cf08 100644 --- a/metadata-ingestion/source_docs/dbt.md +++ b/metadata-ingestion/source_docs/dbt.md @@ -23,11 +23,21 @@ This plugin pulls metadata from dbt's artifact files: source: type: "dbt" config: + # https://docs.getdbt.com/reference/artifacts/manifest-json manifest_path: "./path/dbt/manifest_file.json" + # https://docs.getdbt.com/reference/artifacts/catalog-json catalog_path: "./path/dbt/catalog_file.json" + # https://docs.getdbt.com/reference/artifacts/sources-json sources_path: "./path/dbt/sources_file.json" # (optional, used for freshness checks) + + # the platform that dbt is loading onto target_platform: "postgres" # optional, eg "postgres", "snowflake", etc. + + # whether to load schemas of datasets from dbt + # (otherwise, only includes a simple list of tables) load_schemas: True or False + + # regex pattern to allow/deny nodes node_type_pattern: # optional deny: - ^test.* diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index 661bab8f6a759..a51add12c54be 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -27,7 +27,20 @@ source: extract_transforms: True # whether to ingest Glue jobs, defaults to True - # Filtering patterns for databases and tables to scan - database_pattern: # Optional, to filter databases scanned, same as schema_pattern above. + # Regex filters for databases to scan + database_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_database" + - "junk_database" + # Can also be a regular expression + - "(old|used|deprecated)_database" + allow: + - "good_database" + - "excellent_database" table_pattern: # Optional, to filter tables scanned, same as table_pattern above. + deny: + # ... + allow: + # ... ``` diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index 9b96457f84eb1..e448ebba63fce 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -60,7 +60,6 @@ source: allow: # ... 
- include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index d87f0f4236b17..8fb57eda3b8f0 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -13,9 +13,24 @@ source: config: connection: bootstrap: "broker:9092" - consumer_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.DeserializingConsumer schema_registry_url: http://localhost:8081 - schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient + + # Extra schema registry config. + # These options will be passed into Kafka's SchemaRegistryClient. + # See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient + schema_registry_config: {} + + # Extra consumer config. + # These options will be passed into Kafka's DeserializingConsumer. + # See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer + # and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md. + consumer_config: {} + + # Extra producer config. + # These options will be passed into Kafka's SerializingProducer. + # See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#serializingproducer + # and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md. + producer_config: {} ``` The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. 
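The kafka recipe above notes that the `consumer_config` and `schema_registry_config` blocks are passed straight through to Kafka's `DeserializingConsumer` and `SchemaRegistryClient`. Below is a minimal sketch of what that pass-through could look like with placeholder values; the merge order, the `group.id` used, and the example consumer option are illustrative assumptions, not the source's actual wiring.

```python
from confluent_kafka import DeserializingConsumer
from confluent_kafka.schema_registry import SchemaRegistryClient

# Values as they might appear in the recipe above.
bootstrap = "broker:9092"
schema_registry_url = "http://localhost:8081"
schema_registry_config = {}                       # recipe's schema_registry_config
consumer_config = {"session.timeout.ms": 10000}   # recipe's consumer_config

# The schema registry options are merged with the registry URL.
registry_client = SchemaRegistryClient({"url": schema_registry_url, **schema_registry_config})

# The consumer options are merged with the bootstrap servers and a consumer group
# (the group id here is a placeholder, not the one the source actually uses).
consumer = DeserializingConsumer(
    {
        "bootstrap.servers": bootstrap,
        "group.id": "datahub-kafka-ingestion",
        **consumer_config,
    }
)

print(registry_client, consumer)
```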
From 9af3cab61a66fbee0b27de8df43ab7ada31f5fed Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 16:41:25 -0700 Subject: [PATCH 10/33] Finish consistency checks for recipes --- metadata-ingestion/README.md | 53 ++++++++++--------- metadata-ingestion/source_docs/bigquery.md | 15 ++++++ .../source_docs/kafka-connect.md | 24 +++++++++ metadata-ingestion/source_docs/kafka.md | 25 --------- metadata-ingestion/source_docs/ldap.md | 7 +++ metadata-ingestion/source_docs/looker.md | 20 +++++-- metadata-ingestion/source_docs/lookml.md | 20 +++++-- metadata-ingestion/source_docs/mongodb.md | 25 ++++++--- metadata-ingestion/source_docs/mssql.md | 1 + metadata-ingestion/source_docs/snowflake.md | 19 ++++--- metadata-ingestion/source_docs/superset.md | 4 +- 11 files changed, 141 insertions(+), 72 deletions(-) create mode 100644 metadata-ingestion/source_docs/kafka-connect.md diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 41826a668a220..d13f846479950 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -32,32 +32,33 @@ We use a plugin architecture so that you can install only the dependencies you a Sources: -| Plugin Name | Install Command | Provides | -| --------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- | -| [file](./source_docs/file.md) | _included by default_ | File source and sink | -| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | -| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | -| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | -| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source | -| [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | -| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source | -| [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | -| [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | -| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | -| [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | -| [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | -| [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | -| [mongodb](./source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | -| [mssql](./source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source | -| [mysql](./source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source | -| [oracle](./source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source | -| [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | -| [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | -| [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | -| [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | -| [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | 
Snowflake usage statistics source | -| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | -| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | +| Plugin Name | Install Command | Provides | +| ----------------------------------------------- | ---------------------------------------------------------- | ----------------------------------- | +| [file](./source_docs/file.md) | _included by default_ | File source and sink | +| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | +| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | +| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | +| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source | +| [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | +| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source | +| [glue](./source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | +| [hive](./source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | +| [kafka](./source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | +| [kafka-connect](./source_docs/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'` | Kafka connect source | +| [ldap](./source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | +| [looker](./source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | +| [lookml](./source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | +| [mongodb](./source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | +| [mssql](./source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source | +| [mysql](./source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source | +| [oracle](./source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source | +| [postgres](./source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | +| [redshift](./source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | +| [sagemaker](./source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | +| [snowflake](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | +| [snowflake-usage](./source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | +| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | +| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | Sinks diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 2aec46deade94..6ef504bd3b9f9 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -95,6 +95,21 @@ source: top_n_queries: 10 # number of queries to save for each table env: PROD + + # Additional options to pass to google.cloud.logging_v2.client.Client + extra_client_options: + + # To account for the possibility that the query event arrives after + # the read event in the audit 
logs, we wait for at least `query_log_delay` + # additional events to be processed before attempting to resolve BigQuery + # job information from the logs. If `query_log_delay` is None, it gets treated + # as an unlimited delay, which prioritizes correctness at the expense of memory usage. + query_log_delay: + + # Correction to pad start_time and end_time with. + # For handling the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + max_query_duration: ``` :::note diff --git a/metadata-ingestion/source_docs/kafka-connect.md b/metadata-ingestion/source_docs/kafka-connect.md new file mode 100644 index 0000000000000..1b15a4b20269f --- /dev/null +++ b/metadata-ingestion/source_docs/kafka-connect.md @@ -0,0 +1,24 @@ +# Kafka Connect `kafka-connect` + +This plugin extracts the following: + +- Kafka Connect connector as individual `DataFlowSnapshotClass` entity +- Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming +- Lineage information between source database to Kafka topic + +```yml +source: + type: "kafka-connect" + config: + connect_uri: "http://localhost:8083" + cluster_name: "connect-cluster" + connector_patterns: + deny: + - ^denied-connector.* + allow: + - ^allowed-connector.* +``` + +Current limitations: + +- Currently works only for Debezium source connectors. diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index 8fb57eda3b8f0..4dfdd901cffc6 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -36,28 +36,3 @@ source: The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). - -# Kafka Connect `kafka-connect` - -This plugin extracts the following: - -- Kafka Connect connector as individual `DataFlowSnapshotClass` entity -- Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming -- Lineage information between source database to Kafka topic - -```yml -source: - type: "kafka-connect" - config: - connect_uri: "http://localhost:8083" - cluster_name: "connect-cluster" - connector_patterns: - deny: - - ^denied-connector.* - allow: - - ^allowed-connector.* -``` - -Current limitations: - -- Currently works only for Debezium source connectors. diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md index b1df1f385a99f..696ab8277f6af 100644 --- a/metadata-ingestion/source_docs/ldap.md +++ b/metadata-ingestion/source_docs/ldap.md @@ -15,9 +15,16 @@ source: ldap_server: ldap://localhost ldap_user: "cn=admin,dc=example,dc=org" ldap_password: "admin" + + # Extraction configuration. base_dn: "dc=example,dc=org" filter: "(objectClass=*)" # optional field + + # If set to true, any users without first and last names will be dropped. 
drop_missing_first_last_name: False # optional + + # For creating LDAP controls + page_size: # default is 20 ``` The `drop_missing_first_last_name` should be set to true if you've got many "headless" user LDAP accounts diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index c395781b7a2d9..20af147c0a4ef 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -16,9 +16,23 @@ source: client_id: # Your Looker API3 client ID client_secret: # Your Looker API3 client secret base_url: # The url to your Looker instance: https://company.looker.com:19999 or https://looker.company.com, or similar. - dashboard_pattern: # supports allow/deny regexes - chart_pattern: # supports allow/deny regexes + + platform_name: "looker" # Optional, default is "looker" actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl + + # regex pattern to allow/deny dashboards + dashboard_pattern: + deny: + # ... + allow: + # ... + + # regex pattern to allow/deny charts + chart_pattern: + deny: + # ... + allow: + # ... + env: "PROD" # Optional, default is "PROD" - platform_name: "looker" # Optional, default is "looker" ``` diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index 407656c583850..5591f32cd74c1 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -16,11 +16,25 @@ source: base_folder: /path/to/model/files # where the *.model.lkml and *.view.lkml files are stored connection_to_platform_map: # mappings between connection names in the model files to platform names connection_name: platform_name (or platform_name.database_name) # for ex. my_snowflake_conn: snowflake.my_database - model_pattern: {} - view_pattern: {} + + platform_name: "looker" # optional, default is "looker" + + # regex pattern to allow/deny models + model_pattern: + deny: + # ... + allow: + # ... + + # regex pattern to allow/deny views + view_pattern: + deny: + # ... + allow: + # ... + env: "PROD" # optional, default is "PROD" parse_table_names_from_sql: False # see note below - platform_name: "looker" # optional, default is "looker" ``` Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index 9cb1140e7bced..142fb3cb5f88c 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -21,13 +21,26 @@ source: connect_uri: "mongodb://localhost" username: admin password: password - env: "PROD" # Optional, default is "PROD" + # used for PyMongo authMechanism: "DEFAULT" - options: {} - database_pattern: {} - collection_pattern: {} + + options: {} # kwargs to pass to pymongo.MongoClient enableSchemaInference: True - schemaSamplingSize: 1000 + schemaSamplingSize: 1000 # number of samples for determining schema useRandomSampling: True # whether to randomly sample docs for schema or just use the first ones, True by default - # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above + + env: "PROD" # Optional, default is "PROD" + + # regex pattern to allow/deny databases + database_pattern: + deny: + # ... + allow: + # ... + # regex pattern to allow/deny collections + collection_pattern: + deny: + # ... + allow: + # ... 
``` diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index 12133cf439398..a388231854c26 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -63,6 +63,7 @@ source: # already installed the Microsoft ODBC Driver for SQL Server. # See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 use_odbc: False + # args URL-encode and append to the mssql connection URL uri_args: {} ``` diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index a1badd26f0aa6..dc1e7dda50bef 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -14,14 +14,7 @@ source: username: user password: pass host_port: account_name - database_pattern: - # The escaping of the $ symbol helps us skip the environment variable substitution. - allow: - - ^MY_DEMO_DATA.* - - ^ANOTHER_DB_REGEX - deny: - - ^SNOWFLAKE\$ - - ^SNOWFLAKE_SAMPLE_DATA\$ + warehouse: "COMPUTE_WH" # optional role: "sysadmin" # optional @@ -33,6 +26,16 @@ source: options: # driver_option: some-option + # Regexe filters for databases to allow/deny + database_pattern: + # The escaping of the $ symbol helps us skip the environment variable substitution. + allow: + - ^MY_DEMO_DATA.* + - ^ANOTHER_DB_REGEX + deny: + - ^SNOWFLAKE\$ + - ^SNOWFLAKE_SAMPLE_DATA\$ + # Tables to allow/deny table_pattern: deny: diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md index 5b83566edc960..d0910528d0cba 100644 --- a/metadata-ingestion/source_docs/superset.md +++ b/metadata-ingestion/source_docs/superset.md @@ -10,10 +10,12 @@ This plugin extracts the following: source: type: superset config: + connect_uri: http://localhost:8088 + username: user password: pass provider: db | ldap - connect_uri: http://localhost:8088 + env: "PROD" # Optional, default is "PROD" ``` From 9dc365fa7c9742b856af5f28199db79ebb337293 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 17:39:50 -0700 Subject: [PATCH 11/33] As above --- docs/features.md | 2 +- .../examples/recipes/mongodb_to_datahub.yml | 1 - metadata-ingestion/source_docs/athena.md | 30 ++++++++++++++++++- metadata-ingestion/source_docs/bigquery.md | 2 +- metadata-ingestion/source_docs/hive.md | 2 +- metadata-ingestion/source_docs/redshift.md | 2 +- 6 files changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/features.md b/docs/features.md index e02c8dee47a36..01168a1109577 100644 --- a/docs/features.md +++ b/docs/features.md @@ -40,7 +40,7 @@ Our open sourcing [blog post](https://engineering.linkedin.com/blog/2020/open-so - **Schema history**: view and diff historic versions of schemas - **GraphQL**: visualization of GraphQL schemas -### Jos/flows [*coming soon*] +### Jobs/flows [*coming soon*] - **Search**: full-text & advanced search, search ranking - **Browse**: browsing through a configurable hierarchy - **Basic information**: diff --git a/metadata-ingestion/examples/recipes/mongodb_to_datahub.yml b/metadata-ingestion/examples/recipes/mongodb_to_datahub.yml index 931524093284a..6f1c3cae832a2 100644 --- a/metadata-ingestion/examples/recipes/mongodb_to_datahub.yml +++ b/metadata-ingestion/examples/recipes/mongodb_to_datahub.yml @@ -13,7 +13,6 @@ source: collection_pattern: {} enableSchemaInference: True schemaSamplingSize: 1000 - # database_pattern/collection_pattern are similar to 
schema_pattern/table_pattern from above sink: type: "datahub-rest" config: diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 7792511729487..665bf8a3905c2 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -24,5 +24,33 @@ source: # However, the athena driver will transparently fetch these results as you would expect from any other sql client. work_group: athena_workgroup # "primary" - # table_pattern/schema_pattern is same as above + + # Tables to allow/deny + table_pattern: + deny: + # Note that the deny patterns take precedence over the allow patterns. + - "bad_table" + - "junk_table" + # Can also be a regular expression + - "(old|used|deprecated)_table" + allow: + - "good_table" + - "excellent_table" + + # Although the 'table_pattern' enables you to skip everything from certain schemas, + # having another option to allow/deny on schema level is an optimization for the case when there is a large number + # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter + # them out afterwards via the table_pattern. + schema_pattern: + deny: + # ... + allow: + # ... + + # Same format as table_pattern, used for filtering views + view_pattern: + deny: + # ... + allow: + # ... ``` diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 6ef504bd3b9f9..a92014cd3bebe 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -18,7 +18,7 @@ source: # Many of these options are specific to the underlying database driver, so that library's # documentation will be a good reference for what is supported. To find which dialect is likely # in use, consult this table: https://docs.sqlalchemy.org/en/14/dialects/index.html. - options: # options is same as above + options: # See https://github.com/mxmzdlv/pybigquery#authentication for details. credentials_path: "/path/to/keyfile.json" # optional diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index e448ebba63fce..87e0e137bab28 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -79,7 +79,7 @@ source: connect_args: http_path: "/hive2" auth: BASIC - # table_pattern/schema_pattern is same as above + # ... table_pattern/schema_pattern ```
diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 609dfd8d87715..a905d6771209b 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -68,7 +68,7 @@ See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure- source: type: redshift config: - # username, password, database, etc are all the same as above + # username, password, database, etc... host_port: my-proxy-hostname:5439 options: connect_args: From 9afa393bf75a3387a40fe0ab23ec2f0258d70efd Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 17:48:03 -0700 Subject: [PATCH 12/33] Typo fixes --- metadata-ingestion/source_docs/bigquery.md | 2 +- metadata-ingestion/source_docs/snowflake.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index a92014cd3bebe..5c5839e48260f 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -114,6 +114,6 @@ source: :::note -This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source. +This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source described above. ::: diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index dc1e7dda50bef..25a709bdbc03f 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -71,7 +71,7 @@ source: :::tip -You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source. +You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source described below. ::: @@ -106,6 +106,6 @@ source: :::note -This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source. +This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source described above. ::: From c6388cba34d4ff5adc31579bed0d22406272b504 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 17:58:53 -0700 Subject: [PATCH 13/33] More typo fixes --- docs-website/generateDocsDir.ts | 4 +++- metadata-ingestion/README.md | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 180d3c97c5548..82c6568247ea3 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -160,7 +160,9 @@ function markdown_guess_title( const headers = contents.content.match(/^# (.+)$/gm); if (!headers) { - throw new Error(`${filepath} must have at least one h1 header`); + throw new Error( + `${filepath} must have at least one h1 header for setting the title` + ); } if (headers.length > 1 && contents.content.indexOf("```") < 0) { diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index d13f846479950..5d408ef45a400 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -145,7 +145,7 @@ Running a recipe is quite easy. datahub ingest -c ./examples/recipes/mssql_to_datahub.yml ``` -A number of recipes are included in the [examples/recipes](./examples/recipes) directory. 
See also pages described in the [table of plugins](#installing-plugins) for more context on recipe options for each source and sink. +A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](#installing-plugins). ## Transformations From 8588cb97729460251d680f620cf9ad0a66722824 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 18:06:41 -0700 Subject: [PATCH 14/33] More consistency fixes --- metadata-ingestion/sink_docs/datahub.md | 2 +- metadata-ingestion/sink_docs/file.md | 2 +- metadata-ingestion/source_docs/bigquery.md | 2 ++ metadata-ingestion/source_docs/feast.md | 2 +- metadata-ingestion/source_docs/file.md | 2 +- metadata-ingestion/source_docs/snowflake.md | 4 +++- 6 files changed, 9 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index f77062d7866ae..1341ed8700c65 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -18,7 +18,7 @@ To install this plugin, run `pip install 'acryl-datahub[datahub-kafka]'`. Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based interface is that it's asynchronous and can handle higher throughput. This requires the -Datahub mce-consumer container to be running. +DataHub mce-consumer container to be running. ```yml sink: diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index c7cbcc47d43b2..7c906f991dc5d 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -2,7 +2,7 @@ Outputs metadata to a file. This can be used to decouple metadata sourcing from the process of pushing it into DataHub, and is particularly useful for debugging purposes. -Note that the file source can read files generated by this sink. +Note that the [file source]("../source_docs/file") can read files generated by this sink. ```yml sink: diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 5c5839e48260f..3c110938ff458 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -65,6 +65,8 @@ You can also get fine-grained usage statistics for BigQuery using the `bigquery- To install this plugin, run `pip install 'acryl-datahub[bigquery-usage]'`. +This plugin extracts the following: + - Fetch a list of queries issued - Fetch a list of tables and columns accessed - Aggregate these statistics into buckets, by day or hour granularity diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md index 46b16b41be223..a2a199fc71dc1 100644 --- a/metadata-ingestion/source_docs/feast.md +++ b/metadata-ingestion/source_docs/feast.md @@ -12,7 +12,7 @@ This plugin extracts the following: - Column types associated with each feature and entity Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then -parsed to DataHub's native objects. This was done because of a dependency conflict in the `feast` module. +parsed to DataHub's native objects. This separation was performed because of a dependency conflict in the `feast` module. 
```yml source: diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 0b1ba7c504dad..268cc5084ff6a 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,6 +1,6 @@ # File -This plugin pulls metadata from a previously generated file. The file sink +This plugin pulls metadata from a previously generated file. The [file sink](../sink_docs/file) can produce such files, and a number of samples are included in the [examples/mce_files](../examples/mce_files) directory. diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 25a709bdbc03f..a8286803dfaef 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -75,10 +75,12 @@ You can also get fine-grained usage statistics for Snowflake using the `snowflak ::: -# Snowflake Usage Stats `snowflake-usage` +# Snowflake Usage Stats To install this plugin, run `pip install 'acryl-datahub[snowflake-usage]'`. +This plugin extracts the following: + - Fetch a list of queries issued - Fetch a list of tables and columns accessed (excludes views) - Aggregate these statistics into buckets, by day or hour granularity From 63691dd15245486ef180435bf2afa97021783aa5 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 27 Jul 2021 18:15:30 -0700 Subject: [PATCH 15/33] Fix broken links --- metadata-ingestion/sink_docs/datahub.md | 2 +- metadata-ingestion/sink_docs/file.md | 2 +- metadata-ingestion/source_docs/file.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index 1341ed8700c65..a051422815909 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -1,6 +1,6 @@ # DataHub Rest -To install this plugin, run `pip install 'acryl-datahub[datahub-reset]'`. +To install this plugin, run `pip install 'acryl-datahub[datahub-rest]'`. Pushes metadata to DataHub using the GMA rest API. The advantage of the rest-based interface is that any errors can immediately be reported. diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index 7c906f991dc5d..d1fbab953c6a5 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -2,7 +2,7 @@ Outputs metadata to a file. This can be used to decouple metadata sourcing from the process of pushing it into DataHub, and is particularly useful for debugging purposes. -Note that the [file source]("../source_docs/file") can read files generated by this sink. +Note that the [file source](../source_docs/file.md) can read files generated by this sink. ```yml sink: diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 268cc5084ff6a..56e969865eee2 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,6 +1,6 @@ # File -This plugin pulls metadata from a previously generated file. The [file sink](../sink_docs/file) +This plugin pulls metadata from a previously generated file. The [file sink](../sink_docs/file.md) can produce such files, and a number of samples are included in the [examples/mce_files](../examples/mce_files) directory. 
From eef2a62874d21afce1c2b709549fc484e66e661d Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 2 Aug 2021 17:50:49 -0400 Subject: [PATCH 16/33] Note on allow/deny --- metadata-ingestion/source_docs/athena.md | 8 +++++++- metadata-ingestion/source_docs/bigquery.md | 4 +++- metadata-ingestion/source_docs/dbt.md | 2 +- metadata-ingestion/source_docs/druid.md | 4 +++- metadata-ingestion/source_docs/hive.md | 6 ++++-- metadata-ingestion/source_docs/looker.md | 4 ++-- metadata-ingestion/source_docs/lookml.md | 4 ++-- metadata-ingestion/source_docs/mongodb.md | 4 ++-- metadata-ingestion/source_docs/mssql.md | 6 ++++-- metadata-ingestion/source_docs/mysql.md | 6 ++++-- metadata-ingestion/source_docs/oracle.md | 6 ++++-- metadata-ingestion/source_docs/postgres.md | 6 ++++-- metadata-ingestion/source_docs/redshift.md | 6 ++++-- metadata-ingestion/source_docs/snowflake.md | 8 +++++--- metadata-ingestion/source_docs/sqlalchemy.md | 6 ++++-- 15 files changed, 53 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 665bf8a3905c2..b192359077368 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -25,7 +25,7 @@ source: work_group: athena_workgroup # "primary" - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -41,6 +41,8 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... @@ -54,3 +56,7 @@ source: allow: # ... ``` + +## Questions + +If you've got any questions on configuring this source diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 3c110938ff458..d00fd8d8f37aa 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -22,7 +22,7 @@ source: # See https://github.com/mxmzdlv/pybigquery#authentication for details. credentials_path: "/path/to/keyfile.json" # optional - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -38,6 +38,8 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md index aee7c4736cf08..1232c50f80a74 100644 --- a/metadata-ingestion/source_docs/dbt.md +++ b/metadata-ingestion/source_docs/dbt.md @@ -37,7 +37,7 @@ source: # (otherwise, only includes a simple list of tables) load_schemas: True or False - # regex pattern to allow/deny nodes + # Regex pattern to allow/deny nodes. If left blank, will ingest all. 
node_type_pattern: # optional deny: - ^test.* diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index df43204e2eb40..cce79550062b1 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -26,7 +26,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -42,6 +42,8 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index 4e1a9895269d3..1387d79f3dd2d 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -31,7 +31,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -47,13 +47,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index 20af147c0a4ef..668e784d33d41 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -20,14 +20,14 @@ source: platform_name: "looker" # Optional, default is "looker" actor: urn:li:corpuser:etl # Optional, defaults to urn:li:corpuser:etl - # regex pattern to allow/deny dashboards + # Regex pattern to allow/deny dashboards. If left blank, will ingest all. dashboard_pattern: deny: # ... allow: # ... - # regex pattern to allow/deny charts + # Regex pattern to allow/deny charts. If left blank, will ingest all. chart_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index 5591f32cd74c1..51e818a604cc6 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -19,14 +19,14 @@ source: platform_name: "looker" # optional, default is "looker" - # regex pattern to allow/deny models + # Regex pattern to allow/deny models. If left blank, will ingest all. model_pattern: deny: # ... allow: # ... - # regex pattern to allow/deny views + # Regex pattern to allow/deny views. If left blank, will ingest all. view_pattern: deny: # ... 
diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index 142fb3cb5f88c..298a1a8b81f6a 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -31,13 +31,13 @@ source: env: "PROD" # Optional, default is "PROD" - # regex pattern to allow/deny databases + # Regex pattern to allow/deny databases. If left blank, will ingest all. database_pattern: deny: # ... allow: # ... - # regex pattern to allow/deny collections + # Regex pattern to allow/deny collections. If left blank, will ingest all. collection_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index a388231854c26..8ec73af926275 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -26,7 +26,7 @@ source: options: charset: "utf8" - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -43,13 +43,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index 2ede14967c1d0..b6a05731bfaf3 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -24,7 +24,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -40,13 +40,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index 6550d74c4ea7f..284f3f1a2ba1f 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -30,7 +30,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -46,13 +46,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... 
- # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 94bcacaa49775..94aae26fb304d 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -27,7 +27,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -43,13 +43,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index a905d6771209b..0344536adfc25 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -25,7 +25,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -41,13 +41,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index a8286803dfaef..242623d95c565 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -26,7 +26,7 @@ source: options: # driver_option: some-option - # Regexe filters for databases to allow/deny + # Regex filters for databases to allow/deny. If left blank, will ingest all. database_pattern: # The escaping of the $ symbol helps us skip the environment variable substitution. allow: @@ -36,7 +36,7 @@ source: - ^SNOWFLAKE\$ - ^SNOWFLAKE_SAMPLE_DATA\$ - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -52,13 +52,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. 
If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index ad20ed77bfc4c..1b3d94b8af418 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -26,7 +26,7 @@ source: options: # driver_option: some-option - # Tables to allow/deny + # Tables to allow/deny. If left blank, will ingest all. table_pattern: deny: # Note that the deny patterns take precedence over the allow patterns. @@ -42,13 +42,15 @@ source: # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter # them out afterwards via the table_pattern. + + # If left blank, will ingest all. schema_pattern: deny: # ... allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... From bee872f3a032b921714e132fd616f975a7fb171e Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 2 Aug 2021 17:53:13 -0400 Subject: [PATCH 17/33] Add questions section --- metadata-ingestion/sink_docs/console.md | 4 ++++ metadata-ingestion/sink_docs/datahub.md | 4 ++++ metadata-ingestion/sink_docs/file.md | 4 ++++ metadata-ingestion/source_docs/athena.md | 2 +- metadata-ingestion/source_docs/bigquery.md | 4 ++++ metadata-ingestion/source_docs/dbt.md | 4 ++++ metadata-ingestion/source_docs/druid.md | 4 ++++ metadata-ingestion/source_docs/feast.md | 4 ++++ metadata-ingestion/source_docs/file.md | 4 ++++ metadata-ingestion/source_docs/glue.md | 4 ++++ metadata-ingestion/source_docs/hive.md | 4 ++++ metadata-ingestion/source_docs/kafka-connect.md | 4 ++++ metadata-ingestion/source_docs/kafka.md | 4 ++++ metadata-ingestion/source_docs/ldap.md | 4 ++++ metadata-ingestion/source_docs/looker.md | 4 ++++ metadata-ingestion/source_docs/lookml.md | 4 ++++ metadata-ingestion/source_docs/mongodb.md | 4 ++++ metadata-ingestion/source_docs/mssql.md | 4 ++++ metadata-ingestion/source_docs/mysql.md | 4 ++++ metadata-ingestion/source_docs/oracle.md | 4 ++++ metadata-ingestion/source_docs/postgres.md | 4 ++++ metadata-ingestion/source_docs/redshift.md | 4 ++++ metadata-ingestion/source_docs/sagemaker.md | 4 ++++ metadata-ingestion/source_docs/snowflake.md | 4 ++++ metadata-ingestion/source_docs/sql_profiles.md | 6 +++++- metadata-ingestion/source_docs/sqlalchemy.md | 4 ++++ metadata-ingestion/source_docs/superset.md | 4 ++++ 27 files changed, 106 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/sink_docs/console.md b/metadata-ingestion/sink_docs/console.md index d8bbf7c44c9bd..cc4cb5f126662 100644 --- a/metadata-ingestion/sink_docs/console.md +++ b/metadata-ingestion/sink_docs/console.md @@ -6,3 +6,7 @@ Simply prints each metadata event to stdout. Useful for experimentation and debu sink: type: "console" ``` + +## Questions + +If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! 
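Because the console sink takes no options, it pairs naturally with the file source for quick experiments. A minimal sketch, using only the file-source and console-sink fields shown in these docs, that prints every event from a previously generated MCE file to stdout:

```yml
source:
  type: file
  config:
    filename: ./path/to/mce/file.json   # any file produced by the file sink
sink:
  type: "console"
```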
diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index a051422815909..b32b4c9566647 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -34,3 +34,7 @@ sink: The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). + +## Questions + +If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index d1fbab953c6a5..dc8a43b8049f2 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -10,3 +10,7 @@ sink: config: filename: ./path/to/mce/file.json ``` + +## Questions + +If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index b192359077368..9ebd094c11ad9 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -59,4 +59,4 @@ source: ## Questions -If you've got any questions on configuring this source +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index d00fd8d8f37aa..3d6cc01e5cfd0 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -121,3 +121,7 @@ source: This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source described above. ::: + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md index 1232c50f80a74..49bcfa3594b29 100644 --- a/metadata-ingestion/source_docs/dbt.md +++ b/metadata-ingestion/source_docs/dbt.md @@ -46,3 +46,7 @@ source: ``` Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index cce79550062b1..89ccd12b84afd 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -60,3 +60,7 @@ source: include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! 
diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md index a2a199fc71dc1..48efed0443ddb 100644 --- a/metadata-ingestion/source_docs/feast.md +++ b/metadata-ingestion/source_docs/feast.md @@ -22,3 +22,7 @@ source: env: "PROD" # Optional, default is "PROD" use_local_build: False # Whether to build Feast ingestion image locally, default is False ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 56e969865eee2..826d6cf55abf2 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -10,3 +10,7 @@ source: config: filename: ./path/to/mce/file.json ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index a51add12c54be..3f7fadb63ae40 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -44,3 +44,7 @@ source: allow: # ... ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index 1387d79f3dd2d..95df66613de52 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -85,3 +85,7 @@ source: ```
+ +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/kafka-connect.md b/metadata-ingestion/source_docs/kafka-connect.md index 1b15a4b20269f..00f887aaa14d1 100644 --- a/metadata-ingestion/source_docs/kafka-connect.md +++ b/metadata-ingestion/source_docs/kafka-connect.md @@ -22,3 +22,7 @@ source: Current limitations: - Currently works only for Debezium source connectors. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index 4dfdd901cffc6..1a46799664173 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -36,3 +36,7 @@ source: The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md index 696ab8277f6af..aee334a67ac1b 100644 --- a/metadata-ingestion/source_docs/ldap.md +++ b/metadata-ingestion/source_docs/ldap.md @@ -30,3 +30,7 @@ source: The `drop_missing_first_last_name` should be set to true if you've got many "headless" user LDAP accounts for devices or services should be excluded when they do not contain a first and last name. This will only impact the ingestion of LDAP users, while LDAP groups will be unaffected by this config option. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index 668e784d33d41..34e7c15410b2e 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -36,3 +36,7 @@ source: env: "PROD" # Optional, default is "PROD" ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index 51e818a604cc6..7af19441ad4a5 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -41,3 +41,7 @@ Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-meta views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting `parse_table_names_from_sql: True`. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index 298a1a8b81f6a..13c901b509de6 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -44,3 +44,7 @@ source: allow: # ... 
``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index 8ec73af926275..b3480df136ef4 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -96,3 +96,7 @@ source: ```
+ +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index b6a05731bfaf3..31aeee8c7da45 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -58,3 +58,7 @@ source: include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index 284f3f1a2ba1f..2792ba477c106 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -64,3 +64,7 @@ source: include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 94aae26fb304d..605d08501f669 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -61,3 +61,7 @@ source: include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 0344536adfc25..614818c78e641 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -79,3 +79,7 @@ source: ```
+ +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/source_docs/sagemaker.md index f6ea7009b2448..3e1ec47419c05 100644 --- a/metadata-ingestion/source_docs/sagemaker.md +++ b/metadata-ingestion/source_docs/sagemaker.md @@ -33,3 +33,7 @@ source: training: True transform: True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 242623d95c565..1f309a05b1dea 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -113,3 +113,7 @@ source: This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source described above. ::: + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/sql_profiles.md b/metadata-ingestion/source_docs/sql_profiles.md index dde978dd781f2..08f7dbd49160e 100644 --- a/metadata-ingestion/source_docs/sql_profiles.md +++ b/metadata-ingestion/source_docs/sql_profiles.md @@ -57,4 +57,8 @@ While we've done our best to limit the expensiveness of the queries the profiler should be prudent about the set of tables profiling is enabled on or the frequency of the profiling runs. -::: \ No newline at end of file +::: + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index 1b3d94b8af418..f7bdf1523fc67 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -60,3 +60,7 @@ source: include_views: True # whether to include views, defaults to True include_tables: True # whether to include views, defaults to True ``` + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md index d0910528d0cba..bc67ae5e67234 100644 --- a/metadata-ingestion/source_docs/superset.md +++ b/metadata-ingestion/source_docs/superset.md @@ -20,3 +20,7 @@ source: ``` See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! 
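The allow/deny comments standardized above all follow the same rule: deny patterns take precedence over allow patterns, and a filter left blank ingests everything. As a hedged illustration (the schema and table names are invented for the example), the following MySQL filter ingests everything under schemas starting with `analytics` except the staging copy, and only the `daily_` tables within them:

```yml
source:
  type: mysql                  # the same pattern options exist on the other SQL-based sources
  config:
    # ... connection details omitted ...
    schema_pattern:
      allow:
        - "^analytics"           # matches analytics, analytics_staging, ...
      deny:
        - "^analytics_staging$"  # deny wins, so the staging schema is skipped
    table_pattern:
      allow:
        - "analytics\\.daily_.*" # within allowed schemas, keep only daily_ tables
```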
From 6ffd8a1c9269960e16ff43665be8097b82e67410 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 3 Aug 2021 13:05:02 -0400 Subject: [PATCH 18/33] Fix inconsistencies --- metadata-ingestion/sink_docs/datahub.md | 6 ++++-- metadata-ingestion/source_docs/athena.md | 2 +- metadata-ingestion/source_docs/bigquery.md | 6 +++--- metadata-ingestion/source_docs/druid.md | 8 +++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index b32b4c9566647..d286d5fc7ea73 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -1,4 +1,6 @@ -# DataHub Rest +# DataHub + +## DataHub Rest To install this plugin, run `pip install 'acryl-datahub[datahub-rest]'`. @@ -12,7 +14,7 @@ sink: server: "http://localhost:8080" ``` -# DataHub Kafka +## DataHub Kafka To install this plugin, run `pip install 'acryl-datahub[datahub-kafka]'`. diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 9ebd094c11ad9..39b1b21b94d00 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -49,7 +49,7 @@ source: allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 3d6cc01e5cfd0..ac0a56e700320 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -1,4 +1,4 @@ -# Google BigQuery +# BigQuery To install this plugin, run `pip install 'acryl-datahub[bigquery]'`. @@ -46,7 +46,7 @@ source: allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. view_pattern: deny: # ... @@ -63,7 +63,7 @@ You can also get fine-grained usage statistics for BigQuery using the `bigquery- ::: -# Google BigQuery Usage Stats +# BigQuery Usage Stats To install this plugin, run `pip install 'acryl-datahub[bigquery-usage]'`. diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index 89ccd12b84afd..7031e22fcf379 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -7,9 +7,7 @@ This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table -**Note** It is important to define a explicitly define deny schema pattern for internal druid databases (lookup & sys) -if adding a schema pattern otherwise the crawler may crash before processing relevant databases. -This deny pattern is defined by default but is overriden by user-submitted configurations +**Note**: It is important to explicitly define the deny schema pattern for internal Druid databases (lookup & sys) if adding a schema pattern. Otherwise, the crawler may crash before processing relevant databases. This deny pattern is defined by default but is overriden by user-submitted configurations. ```yml source: @@ -46,11 +44,11 @@ source: # If left blank, will ingest all. schema_pattern: deny: - # ... + - "^(lookup|sys).*" # default, ignores internal Druid databases (see note below) allow: # ... - # Same format as table_pattern, used for filtering views + # Same format as table_pattern, used for filtering views. If left blank, will ingest all. 
view_pattern: deny: # ... From 8a4de6d4ebdf0439585dee865447c62478755ab6 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 3 Aug 2021 16:29:03 -0400 Subject: [PATCH 19/33] Begin separation of quickstart and config details --- metadata-ingestion/sink_docs/console.md | 12 ++++++ metadata-ingestion/sink_docs/datahub.md | 20 ++++++++++ metadata-ingestion/sink_docs/file.md | 12 ++++++ metadata-ingestion/source_docs/athena.md | 12 +++++- metadata-ingestion/source_docs/bigquery.md | 40 ++++++++++++++----- metadata-ingestion/source_docs/dbt.md | 12 ++++++ metadata-ingestion/source_docs/druid.md | 10 +++++ metadata-ingestion/source_docs/feast.md | 10 +++++ metadata-ingestion/source_docs/file.md | 12 ++++++ metadata-ingestion/source_docs/glue.md | 10 +++++ metadata-ingestion/source_docs/hive.md | 10 +++++ .../source_docs/kafka-connect.md | 14 ++++++- metadata-ingestion/source_docs/kafka.md | 10 +++++ metadata-ingestion/source_docs/ldap.md | 10 +++++ metadata-ingestion/source_docs/looker.md | 10 +++++ metadata-ingestion/source_docs/lookml.md | 10 +++++ metadata-ingestion/source_docs/mongodb.md | 10 +++++ metadata-ingestion/source_docs/mssql.md | 12 +++++- metadata-ingestion/source_docs/mysql.md | 10 +++++ metadata-ingestion/source_docs/oracle.md | 10 +++++ metadata-ingestion/source_docs/postgres.md | 10 +++++ metadata-ingestion/source_docs/redshift.md | 10 +++++ metadata-ingestion/source_docs/sagemaker.md | 12 +++++- metadata-ingestion/source_docs/snowflake.md | 40 ++++++++++++++----- .../source_docs/sql_profiles.md | 26 ++++++++---- metadata-ingestion/source_docs/sqlalchemy.md | 12 +++++- metadata-ingestion/source_docs/superset.md | 12 +++++- 27 files changed, 343 insertions(+), 35 deletions(-) diff --git a/metadata-ingestion/sink_docs/console.md b/metadata-ingestion/sink_docs/console.md index cc4cb5f126662..edad962582533 100644 --- a/metadata-ingestion/sink_docs/console.md +++ b/metadata-ingestion/sink_docs/console.md @@ -1,12 +1,24 @@ # Console +## Setup + +Works with `acryl-datahub` out of the box. + +## Capabilities + Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml sink: type: "console" ``` +## Config details + ## Questions If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index d286d5fc7ea73..a19488d43eae4 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -2,11 +2,19 @@ ## DataHub Rest +### Setup + To install this plugin, run `pip install 'acryl-datahub[datahub-rest]'`. +### Capabilities + Pushes metadata to DataHub using the GMA rest API. The advantage of the rest-based interface is that any errors can immediately be reported. +### Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml sink: type: "datahub-rest" @@ -14,14 +22,24 @@ sink: server: "http://localhost:8080" ``` +### Config details + ## DataHub Kafka +### Setup + To install this plugin, run `pip install 'acryl-datahub[datahub-kafka]'`. +### Capabilities + Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based interface is that it's asynchronous and can handle higher throughput. 
This requires the DataHub mce-consumer container to be running. +### Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml sink: type: "datahub-kafka" @@ -33,6 +51,8 @@ sink: schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient ``` +### Config details + The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index dc8a43b8049f2..2e2f95ef37f9f 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -1,9 +1,19 @@ # File +## Setup + +Works with `acryl-datahub` out of the box. + +## Capabilities + Outputs metadata to a file. This can be used to decouple metadata sourcing from the process of pushing it into DataHub, and is particularly useful for debugging purposes. Note that the [file source](../source_docs/file.md) can read files generated by this sink. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml sink: type: file @@ -11,6 +21,8 @@ sink: filename: ./path/to/mce/file.json ``` +## Config details + ## Questions If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index 39b1b21b94d00..d41be109d8bbd 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -1,12 +1,20 @@ -# AWS Athena +# Athena + +## Setup To install this plugin, run `pip install 'acryl-datahub[athena]'`. +## Capabilities + This plugin extracts the following: - List of databases and tables - Column types associated with each table +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: athena @@ -57,6 +65,8 @@ source: # ... ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index ac0a56e700320..270de3b16613d 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -1,12 +1,26 @@ # BigQuery +## Setup + To install this plugin, run `pip install 'acryl-datahub[bigquery]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table +:::tip + +You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source described below. + +::: + +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. 
+ ```yml source: type: bigquery @@ -57,16 +71,16 @@ source: include_tables: True # whether to include views, defaults to True ``` -:::tip - -You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source described below. - -::: +## Config details # BigQuery Usage Stats +## Setup + To install this plugin, run `pip install 'acryl-datahub[bigquery-usage]'`. +## Capabilities + This plugin extracts the following: - Fetch a list of queries issued @@ -80,6 +94,16 @@ Note: the client must have one of the following OAuth scopes, and should be auth - https://www.googleapis.com/auth/cloud-platform.read-only - https://www.googleapis.com/auth/cloud-platform +:::note + +This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source described above. + +::: + +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: bigquery-usage @@ -116,11 +140,7 @@ source: max_query_duration: ``` -:::note - -This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source described above. - -::: +## Config details ## Questions diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/source_docs/dbt.md index 49bcfa3594b29..155f0caa1d670 100644 --- a/metadata-ingestion/source_docs/dbt.md +++ b/metadata-ingestion/source_docs/dbt.md @@ -1,5 +1,11 @@ # dbt +## Setup + +Works with `acryl-datahub` out of the box. + +## Capabilities + This plugin pulls metadata from dbt's artifact files: - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) @@ -19,6 +25,10 @@ This plugin pulls metadata from dbt's artifact files: - node_type_pattern: - Use this filter to exclude and include node types using allow or deny method +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "dbt" @@ -45,6 +55,8 @@ source: - ^.* ``` +## Config details + Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage. ## Questions diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/source_docs/druid.md index 7031e22fcf379..0643d84535054 100644 --- a/metadata-ingestion/source_docs/druid.md +++ b/metadata-ingestion/source_docs/druid.md @@ -1,7 +1,11 @@ # Druid +## Setup + To install this plugin, run `pip install 'acryl-datahub[druid]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables @@ -9,6 +13,10 @@ This plugin extracts the following: **Note**: It is important to explicitly define the deny schema pattern for internal Druid databases (lookup & sys) if adding a schema pattern. Otherwise, the crawler may crash before processing relevant databases. This deny pattern is defined by default but is overriden by user-submitted configurations. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. 
+ ```yml source: type: druid @@ -59,6 +67,8 @@ source: include_tables: True # whether to include views, defaults to True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/source_docs/feast.md index 48efed0443ddb..78ae0bedad32a 100644 --- a/metadata-ingestion/source_docs/feast.md +++ b/metadata-ingestion/source_docs/feast.md @@ -1,9 +1,13 @@ # Feast +## Setup + **Note: Feast ingestion requires Docker to be installed.** To install this plugin, run `pip install 'acryl-datahub[feast]'`. +## Capabilities + This plugin extracts the following: - List of feature tables (modeled as [`MLFeatureTable`](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), @@ -14,6 +18,10 @@ This plugin extracts the following: Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then parsed to DataHub's native objects. This separation was performed because of a dependency conflict in the `feast` module. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: feast @@ -23,6 +31,8 @@ source: use_local_build: False # Whether to build Feast ingestion image locally, default is False ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/source_docs/file.md index 826d6cf55abf2..f7347d840d142 100644 --- a/metadata-ingestion/source_docs/file.md +++ b/metadata-ingestion/source_docs/file.md @@ -1,9 +1,19 @@ # File +## Setup + +Works with `acryl-datahub` out of the box. + +## Capabilities + This plugin pulls metadata from a previously generated file. The [file sink](../sink_docs/file.md) can produce such files, and a number of samples are included in the [examples/mce_files](../examples/mce_files) directory. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: file @@ -11,6 +21,8 @@ source: filename: ./path/to/mce/file.json ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index 3f7fadb63ae40..1112266db9470 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -1,9 +1,13 @@ # AWS Glue +## Setup + To install this plugin, run `pip install 'acryl-datahub[glue]'`. Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../s3-ingestion.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. +## Capabilities + This plugin extracts the following: - List of tables @@ -11,6 +15,10 @@ This plugin extracts the following: - Table metadata, such as owner, description and parameters - Jobs and their component transformations, data sources, and data sinks +## Quickstart recipe + +Use the below recipe to get started with ingestion. 
See [below](#config-details) for full configuration options. + ```yml source: type: glue @@ -45,6 +53,8 @@ source: # ... ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/source_docs/hive.md index 95df66613de52..f19df2743a225 100644 --- a/metadata-ingestion/source_docs/hive.md +++ b/metadata-ingestion/source_docs/hive.md @@ -1,13 +1,21 @@ # Hive +## Setup + To install this plugin, run `pip install 'acryl-datahub[hive]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table - Detailed table and storage information +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: hive @@ -86,6 +94,8 @@ source: +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/kafka-connect.md b/metadata-ingestion/source_docs/kafka-connect.md index 00f887aaa14d1..f82bf2a1747c9 100644 --- a/metadata-ingestion/source_docs/kafka-connect.md +++ b/metadata-ingestion/source_docs/kafka-connect.md @@ -1,4 +1,10 @@ -# Kafka Connect `kafka-connect` +# Kafka Connect + +## Setup + +To install this plugin, run `pip install 'acryl-datahub[kafka-connect]'`. + +## Capabilities This plugin extracts the following: @@ -6,6 +12,10 @@ This plugin extracts the following: - Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming - Lineage information between source database to Kafka topic +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "kafka-connect" @@ -23,6 +33,8 @@ Current limitations: - Currently works only for Debezium source connectors. +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/source_docs/kafka.md index 1a46799664173..88efcc99d9d7a 100644 --- a/metadata-ingestion/source_docs/kafka.md +++ b/metadata-ingestion/source_docs/kafka.md @@ -1,12 +1,20 @@ # Kafka Metadata +## Setup + To install this plugin, run `pip install 'acryl-datahub[kafka]'`. +## Capabilities + This plugin extracts the following: - List of topics - from the Kafka broker - Schemas associated with each topic - from the schema registry +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "kafka" @@ -33,6 +41,8 @@ source: producer_config: {} ``` +## Config details + The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). 
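Since `consumer_config` and `schema_registry_config` are forwarded verbatim, security settings are supplied as plain confluent-kafka / librdkafka properties rather than DataHub-specific options. A rough sketch for a SASL-protected cluster follows; the property names come from librdkafka and the Schema Registry client, not from these docs, so treat them as an assumption and use the linked secured_kafka.yml recipe as the authoritative reference:

```yml
source:
  type: "kafka"
  config:
    connection:
      bootstrap: "broker:9092"
      schema_registry_url: "http://localhost:8081"
      # Passed through to confluent_kafka.DeserializingConsumer (librdkafka properties).
      consumer_config:
        security.protocol: "SASL_SSL"
        sasl.mechanism: "PLAIN"
        sasl.username: "<kafka-api-key>"      # placeholder credentials
        sasl.password: "<kafka-api-secret>"
      # Passed through to confluent_kafka.schema_registry.SchemaRegistryClient.
      schema_registry_config:
        basic.auth.user.info: "<sr-key>:<sr-secret>"
```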
diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/source_docs/ldap.md index aee334a67ac1b..0a4dfefe16a76 100644 --- a/metadata-ingestion/source_docs/ldap.md +++ b/metadata-ingestion/source_docs/ldap.md @@ -1,13 +1,21 @@ # LDAP +## Setup + To install this plugin, run `pip install 'acryl-datahub[ldap]'`. +## Capabilities + This plugin extracts the following: - List of people - Names, emails, titles, and manager information for each person - List of groups +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "ldap" @@ -27,6 +35,8 @@ source: page_size: # default is 20 ``` +## Config details + The `drop_missing_first_last_name` should be set to true if you've got many "headless" user LDAP accounts for devices or services should be excluded when they do not contain a first and last name. This will only impact the ingestion of LDAP users, while LDAP groups will be unaffected by this config option. diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/source_docs/looker.md index 34e7c15410b2e..97d5e4184067b 100644 --- a/metadata-ingestion/source_docs/looker.md +++ b/metadata-ingestion/source_docs/looker.md @@ -1,7 +1,11 @@ # Looker dashboards +## Setup + To install this plugin, run `pip install 'acryl-datahub[looker]'`. +## Capabilities + This plugin extracts the following: - Looker dashboards and dashboard elements (charts) @@ -9,6 +13,10 @@ This plugin extracts the following: See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "looker" @@ -37,6 +45,8 @@ source: env: "PROD" # Optional, default is "PROD" ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md index 7af19441ad4a5..4b1aff7dfa34c 100644 --- a/metadata-ingestion/source_docs/lookml.md +++ b/metadata-ingestion/source_docs/lookml.md @@ -1,14 +1,22 @@ # LookML +## Setup + To install this plugin, run `pip install 'acryl-datahub[lookml]'`. Note! This plugin uses a package that requires Python 3.7+! +## Capabilities + This plugin extracts the following: - LookML views from model files - Name, upstream table names, dimensions, measures, and dimension groups +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "lookml" @@ -37,6 +45,8 @@ source: parse_table_names_from_sql: False # see note below ``` +## Config details + Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that Looker supports, the result might not be correct. 
This parsing is disabled by default, but can be enabled by setting diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/source_docs/mongodb.md index 13c901b509de6..d54257402eb41 100644 --- a/metadata-ingestion/source_docs/mongodb.md +++ b/metadata-ingestion/source_docs/mongodb.md @@ -1,7 +1,11 @@ # MongoDB +## Setup + To install this plugin, run `pip install 'acryl-datahub[mongodb]'`. +## Capabilities + This plugin extracts the following: - List of databases @@ -12,6 +16,10 @@ Moreover, setting `useRandomSampling: False` will sample the first documents fou Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: "mongodb" @@ -45,6 +53,8 @@ source: # ... ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/source_docs/mssql.md index b3480df136ef4..b9bd958556870 100644 --- a/metadata-ingestion/source_docs/mssql.md +++ b/metadata-ingestion/source_docs/mssql.md @@ -1,14 +1,22 @@ -# Microsoft SQL Server Metadata +# Microsoft SQL Server + +## Setup To install this plugin, run `pip install 'acryl-datahub[mssql]'`. We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. +## Capabilities + This plugin extracts the following: - List of databases, schema, tables and views - Column types associated with each table/view +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: mssql @@ -97,6 +105,8 @@ source: +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/source_docs/mysql.md index 31aeee8c7da45..28e846b97aba5 100644 --- a/metadata-ingestion/source_docs/mysql.md +++ b/metadata-ingestion/source_docs/mysql.md @@ -1,12 +1,20 @@ # MySQL +## Setup + To install this plugin, run `pip install 'acryl-datahub[mysql]'`. +## Capabilities + This plugin extracts the following: - List of databases and tables - Column types and schema associated with each table +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: mysql @@ -59,6 +67,8 @@ source: include_tables: True # whether to include views, defaults to True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/source_docs/oracle.md index 2792ba477c106..ba081d7a356d5 100644 --- a/metadata-ingestion/source_docs/oracle.md +++ b/metadata-ingestion/source_docs/oracle.md @@ -1,7 +1,11 @@ # Oracle +## Setup + To install this plugin, run `pip install 'acryl-datahub[oracle]'`. 
+## Capabilities + This plugin extracts the following: - List of databases, schema, and tables @@ -9,6 +13,10 @@ This plugin extracts the following: Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html). +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: oracle @@ -65,6 +73,8 @@ source: include_tables: True # whether to include views, defaults to True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/source_docs/postgres.md index 605d08501f669..fa5c9be08a056 100644 --- a/metadata-ingestion/source_docs/postgres.md +++ b/metadata-ingestion/source_docs/postgres.md @@ -1,7 +1,11 @@ # PostgreSQL +## Setup + To install this plugin, run `pip install 'acryl-datahub[postgres]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables @@ -9,6 +13,10 @@ This plugin extracts the following: - Also supports PostGIS extensions - database_alias (optional) can be used to change the name of database to be ingested +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: postgres @@ -62,6 +70,8 @@ source: include_tables: True # whether to include views, defaults to True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 614818c78e641..75bc6ff990a9e 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -1,13 +1,21 @@ # Redshift +## Setup + To install this plugin, run `pip install 'acryl-datahub[redshift]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table - Also supports PostGIS extensions +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: redshift @@ -80,6 +88,8 @@ source: +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/source_docs/sagemaker.md index 3e1ec47419c05..315654e8044c7 100644 --- a/metadata-ingestion/source_docs/sagemaker.md +++ b/metadata-ingestion/source_docs/sagemaker.md @@ -1,12 +1,20 @@ -# AWS SageMaker +# SageMaker + +## Setup To install this plugin, run `pip install 'acryl-datahub[sagemaker]'`. +## Capabilities + This plugin extracts the following: - Feature groups - Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job) +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. 
+ ```yml source: type: sagemaker @@ -34,6 +42,8 @@ source: transform: True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/source_docs/snowflake.md index 1f309a05b1dea..e42c96ba2d574 100644 --- a/metadata-ingestion/source_docs/snowflake.md +++ b/metadata-ingestion/source_docs/snowflake.md @@ -1,12 +1,26 @@ # Snowflake +## Setup + To install this plugin, run `pip install 'acryl-datahub[snowflake]'`. +## Capabilities + This plugin extracts the following: - List of databases, schema, and tables - Column types associated with each table +:::tip + +You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source described below. + +::: + +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: snowflake @@ -71,16 +85,14 @@ source: include_tables: True # whether to include views, defaults to True ``` -:::tip - -You can also get fine-grained usage statistics for Snowflake using the `snowflake-usage` source described below. - -::: - # Snowflake Usage Stats +## Setup + To install this plugin, run `pip install 'acryl-datahub[snowflake-usage]'`. +## Capabilities + This plugin extracts the following: - Fetch a list of queries issued @@ -91,6 +103,16 @@ Note: the user/role must have access to the account usage table. The "accountadm Note: the underlying access history views that we use are only available in Snowflake's enterprise edition or higher. +:::note + +This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source described above. + +::: + +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: snowflake-usage @@ -108,11 +130,7 @@ source: top_n_queries: 10 # number of queries to save for each table ``` -:::note - -This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source described above. - -::: +## Config details ## Questions diff --git a/metadata-ingestion/source_docs/sql_profiles.md b/metadata-ingestion/source_docs/sql_profiles.md index 08f7dbd49160e..cce4f8df14d35 100644 --- a/metadata-ingestion/source_docs/sql_profiles.md +++ b/metadata-ingestion/source_docs/sql_profiles.md @@ -1,10 +1,23 @@ # SQL Profiles +## Setup + To install this plugin, run `pip install 'acryl-datahub[sql-profiles]'`. The SQL-based profiler does not run alone, but rather can be enabled for other SQL-based sources. Enabling profiling will slow down ingestion runs. +:::caution + +Running profiling against many tables or over many rows can run up significant costs. +While we've done our best to limit the expensiveness of the queries the profiler runs, you +should be prudent about the set of tables profiling is enabled on or the frequency +of the profiling runs. + +::: + +## Capabilities + Extracts: - row and column counts for each table @@ -28,6 +41,10 @@ Supported SQL sources: - Snowflake - Generic SQLAlchemy source +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. 
+ ```yml source: type: # can be bigquery, snowflake, etc - see above for the list @@ -50,14 +67,7 @@ source: include_views: true ``` -:::caution - -Running profiling against many tables or over many rows can run up significant costs. -While we've done our best to limit the expensiveness of the queries the profiler runs, you -should be prudent about the set of tables profiling is enabled on or the frequency -of the profiling runs. - -::: +## Config details ## Questions diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/source_docs/sqlalchemy.md index f7bdf1523fc67..c47a1907de2fe 100644 --- a/metadata-ingestion/source_docs/sqlalchemy.md +++ b/metadata-ingestion/source_docs/sqlalchemy.md @@ -1,4 +1,6 @@ -# Other databases using SQLAlchemy +# Other SQLAlchemy databases + +## Setup To install this plugin, run `pip install 'acryl-datahub[sqlalchemy]'`. @@ -6,11 +8,17 @@ The `sqlalchemy` source is useful if we don't have a pre-built source for your c database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself. +## Capabilities + This plugin extracts the following: - List of schemas and tables - Column types associated with each table +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: sqlalchemy @@ -61,6 +69,8 @@ source: include_tables: True # whether to include views, defaults to True ``` +## Config details + ## Questions If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/source_docs/superset.md index bc67ae5e67234..47c21001d3b91 100644 --- a/metadata-ingestion/source_docs/superset.md +++ b/metadata-ingestion/source_docs/superset.md @@ -1,11 +1,21 @@ # Superset +## Setup + To install this plugin, run `pip install 'acryl-datahub[superset]'`. +See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. + +## Capabilities + This plugin extracts the following: - List of charts and dashboards +## Quickstart recipe + +Use the below recipe to get started with ingestion. See [below](#config-details) for full configuration options. + ```yml source: type: superset @@ -19,7 +29,7 @@ source: env: "PROD" # Optional, default is "PROD" ``` -See documentation for superset's `/security/login` at https://superset.apache.org/docs/rest-api for more details on superset's login api. 
+## Config details ## Questions From 8bf27a5e41da3f987c3612fd650c78e3b0355aa1 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 3 Aug 2021 17:38:07 -0400 Subject: [PATCH 20/33] Write generic sqlalchemy options --- metadata-ingestion/sink_docs/console.md | 5 ++ metadata-ingestion/sink_docs/datahub.md | 10 +++ metadata-ingestion/sink_docs/file.md | 5 ++ metadata-ingestion/source_docs/athena.md | 52 ++++++-------- metadata-ingestion/source_docs/bigquery.md | 66 ++++++----------- metadata-ingestion/source_docs/dbt.md | 35 +++++---- metadata-ingestion/source_docs/druid.md | 61 +++++----------- metadata-ingestion/source_docs/feast.md | 14 +++- metadata-ingestion/source_docs/file.md | 6 ++ metadata-ingestion/source_docs/glue.md | 35 +++++---- metadata-ingestion/source_docs/hive.md | 60 +++++----------- .../source_docs/kafka-connect.md | 20 +++--- metadata-ingestion/source_docs/kafka.md | 10 +++ metadata-ingestion/source_docs/ldap.md | 12 ++++ metadata-ingestion/source_docs/looker.md | 15 ++++ metadata-ingestion/source_docs/lookml.md | 14 ++++ metadata-ingestion/source_docs/mongodb.md | 18 +++++ metadata-ingestion/source_docs/mssql.md | 71 ++++++------------- metadata-ingestion/source_docs/mysql.md | 58 +++++---------- metadata-ingestion/source_docs/oracle.md | 62 ++++++---------- metadata-ingestion/source_docs/postgres.md | 62 ++++++---------- metadata-ingestion/source_docs/redshift.md | 50 +++++-------- metadata-ingestion/source_docs/sagemaker.md | 40 ++++++----- metadata-ingestion/source_docs/snowflake.md | 70 ++++++++---------- .../source_docs/sql_profiles.md | 23 +++--- metadata-ingestion/source_docs/sqlalchemy.md | 47 +++++------- metadata-ingestion/source_docs/superset.md | 12 ++++ 27 files changed, 417 insertions(+), 516 deletions(-) diff --git a/metadata-ingestion/sink_docs/console.md b/metadata-ingestion/sink_docs/console.md index edad962582533..f803ad94f764a 100644 --- a/metadata-ingestion/sink_docs/console.md +++ b/metadata-ingestion/sink_docs/console.md @@ -19,6 +19,11 @@ sink: ## Config details +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ----- | -------- | ------- | ----------- | + ## Questions If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index a19488d43eae4..6ff40c97a37a9 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -24,6 +24,11 @@ sink: ### Config details +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ----- | -------- | ------- | ----------- | + ## DataHub Kafka ### Setup @@ -53,6 +58,11 @@ sink: ### Config details +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ----- | -------- | ------- | ----------- | + The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively. For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml). 
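The pass-through blocks above are easiest to see in a concrete recipe. Below is a minimal sketch of a secured `datahub-kafka` sink; the broker address, registry URL, and credential values are placeholders, and the individual keys inside `producer_config` and `schema_registry_config` are forwarded verbatim to confluent-kafka's `SerializingProducer` and `SchemaRegistryClient`, so that library's documentation (and the linked example recipe) is the authoritative list of accepted properties.

```yml
sink:
  type: "datahub-kafka"
  config:
    connection:
      bootstrap: "broker.example.com:9092" # placeholder broker address
      schema_registry_url: "https://registry.example.com:8081" # placeholder registry URL
      # Passed as-is to confluent_kafka.SerializingProducer
      producer_config:
        security.protocol: "SASL_SSL"
        sasl.mechanism: "PLAIN"
        sasl.username: "KAFKA_KEY_ID" # placeholder credential
        sasl.password: "KAFKA_KEY_SECRET" # placeholder credential
      # Passed as-is to confluent_kafka.schema_registry.SchemaRegistryClient
      schema_registry_config:
        basic.auth.user.info: "REGISTRY_KEY_ID:REGISTRY_KEY_SECRET" # placeholder credential
```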
diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index 2e2f95ef37f9f..a678a60efe09a 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -23,6 +23,11 @@ sink: ## Config details +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ----- | -------- | ------- | ----------- | + ## Questions If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/source_docs/athena.md index d41be109d8bbd..e77d0e15c8f1b 100644 --- a/metadata-ingestion/source_docs/athena.md +++ b/metadata-ingestion/source_docs/athena.md @@ -32,41 +32,31 @@ source: # However, the athena driver will transparently fetch these results as you would expect from any other sql client. work_group: athena_workgroup # "primary" - - # Tables to allow/deny. If left blank, will ingest all. - table_pattern: - deny: - # Note that the deny patterns take precedence over the allow patterns. - - "bad_table" - - "junk_table" - # Can also be a regular expression - - "(old|used|deprecated)_table" - allow: - - "good_table" - - "excellent_table" - - # Although the 'table_pattern' enables you to skip everything from certain schemas, - # having another option to allow/deny on schema level is an optimization for the case when there is a large number - # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter - # them out afterwards via the table_pattern. - - # If left blank, will ingest all. - schema_pattern: - deny: - # ... - allow: - # ... - - # Same format as table_pattern, used for filtering views. If left blank, will ingest all. - view_pattern: - deny: - # ... - allow: - # ... ``` ## Config details +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ---------------------- | -------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `username` | ❌ | Autodetected | Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html | +| `password` | ❌ | Autodetected | Same detection scheme as `username` | +| `database` | ❌ | Autodetected | | +| `aws_region` | ✅ | | | +| `s3_staging_dir` | ✅ | | Of format `"s3:///prefix/"`. The `s3_staging_dir` parameter is needed because Athena always writes query results to S3.
See https://docs.aws.amazon.com/athena/latest/ug/querying.html. |
| `work_group` | ✅ | | Name of Athena workgroup. See https://docs.aws.amazon.com/athena/latest/ug/manage-queries-control-costs-with-workgroups.html. |
| `env` | ❌ | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `options.