feat(docs): refactor source and sink ingestion docs (datahub-project#…

kevinhu authored and Rahul Jain committed Aug 31, 2021
1 parent 4453c09 commit 2f6c20d

Showing 32 changed files with 2,047 additions and 906 deletions.
7 changes: 7 additions & 0 deletions docs-website/generateDocsDir.ts
@@ -159,6 +159,13 @@ function markdown_guess_title(
  } else {
    // Find first h1 header and use it as the title.
    const headers = contents.content.match(/^# (.+)$/gm);
+
+    if (!headers) {
+      throw new Error(
+        `${filepath} must have at least one h1 header for setting the title`
+      );
+    }
+
    if (headers.length > 1 && contents.content.indexOf("```") < 0) {
      throw new Error(`too many h1 headers in ${filepath}`);
    }
8 changes: 8 additions & 0 deletions docs-website/sidebars.js
@@ -55,6 +55,14 @@ module.exports = {
"docs/architecture/metadata-serving",
//"docs/what/gms",
],
"Metadata Ingestion": [
{
Sources: list_ids_in_directory("metadata-ingestion/source_docs"),
},
{
Sinks: list_ids_in_directory("metadata-ingestion/sink_docs"),
},
],
"Metadata Modeling": [
"docs/modeling/metadata-model",
"docs/modeling/extending-the-metadata-model",
2 changes: 1 addition & 1 deletion docs/features.md
@@ -40,7 +40,7 @@ Our open sourcing [blog post](https://engineering.linkedin.com/blog/2020/open-so
- **Schema history**: view and diff historic versions of schemas
- **GraphQL**: visualization of GraphQL schemas

-### Jos/flows [*coming soon*]
+### Jobs/flows [*coming soon*]
- **Search**: full-text & advanced search, search ranking
- **Browse**: browsing through a configurable hierarchy
- **Basic information**:
952 changes: 48 additions & 904 deletions metadata-ingestion/README.md

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion metadata-ingestion/examples/recipes/mongodb_to_datahub.yml
@@ -13,7 +13,6 @@ source:
    collection_pattern: {}
    enableSchemaInference: True
    schemaSamplingSize: 1000
-    # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above
sink:
  type: "datahub-rest"
  config:
33 changes: 33 additions & 0 deletions metadata-ingestion/sink_docs/console.md
@@ -0,0 +1,33 @@
# Console

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

## Setup

Works with `acryl-datahub` out of the box.

## Capabilities

Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes.

## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.

For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).

```yml
source:
  # source configs

sink:
  type: "console"
```
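
As a quick way to see it in action, the following sketch replays a previously exported metadata file through the console sink. The `./sample_mce.json` path is a hypothetical placeholder, such as output from the [file sink](file.md):

```yml
source:
  type: file
  config:
    # hypothetical path to a previously exported metadata events file
    filename: ./sample_mce.json

sink:
  type: "console"
```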

## Config details

None!

## Questions

If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
87 changes: 87 additions & 0 deletions metadata-ingestion/sink_docs/datahub.md
@@ -0,0 +1,87 @@
# DataHub

## DataHub Rest

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

### Setup

To install this plugin, run `pip install 'acryl-datahub[datahub-rest]'`.

### Capabilities

Pushes metadata to DataHub using the GMA REST API. The advantage of the REST-based
interface is that any errors are reported immediately.

### Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.

For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).

```yml
source:
  # source configs

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```

### Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

| Field    | Required | Default | Description                  |
| -------- | -------- | ------- | ---------------------------- |
| `server` | ✅       |         | URL of DataHub GMS endpoint. |

## DataHub Kafka

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

### Setup

To install this plugin, run `pip install 'acryl-datahub[datahub-kafka]'`.

### Capabilities

Pushes metadata to DataHub by publishing messages to Kafka. The advantage of the Kafka-based
interface is that it's asynchronous and can handle higher throughput.

### Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.

For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).

```yml
source:
  # source configs

sink:
  type: "datahub-kafka"
  config:
    connection:
      bootstrap: "localhost:9092"
      schema_registry_url: "http://localhost:8081"
```

### Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

| Field                                        | Required | Default | Description                                                                                                                                               |
| -------------------------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `connection.bootstrap`                       | ✅       |         | Kafka bootstrap URL.                                                                                                                                      |
| `connection.producer_config.<option>`        |          |         | Passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.SerializingProducer                  |
| `connection.schema_registry_url`             | ✅       |         | URL of schema registry being used.                                                                                                                        |
| `connection.schema_registry_config.<option>` |          |         | Passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient |

The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively.
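
As a concrete sketch of that pass-through, the recipe fragment below nests two illustrative options. The keys chosen here (`retries` for the producer, `basic.auth.user.info` for the schema registry) are standard Confluent client settings used as examples, not requirements:

```yml
sink:
  type: "datahub-kafka"
  config:
    connection:
      bootstrap: "localhost:9092"
      schema_registry_url: "http://localhost:8081"
      producer_config:
        # forwarded verbatim to the Kafka SerializingProducer
        retries: 3
      schema_registry_config:
        # forwarded verbatim to the SchemaRegistryClient
        basic.auth.user.info: "username:password"
```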

For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).

## Questions

If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
41 changes: 41 additions & 0 deletions metadata-ingestion/sink_docs/file.md
@@ -0,0 +1,41 @@
# File

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

## Setup

Works with `acryl-datahub` out of the box.

## Capabilities

Outputs metadata to a file. This can be used to decouple metadata sourcing from the
process of pushing it into DataHub, and is particularly useful for debugging purposes.
Note that the [file source](../source_docs/file.md) can read files generated by this sink.

## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.

For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).

```yml
source:
  # source configs

sink:
  type: file
  config:
    filename: ./path/to/mce/file.json
```
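
Since the file source can read what this sink writes, the second half of that round trip looks like the following sketch, which replays a previously written file into DataHub. The path and server address are placeholders:

```yml
source:
  type: file
  config:
    # file previously written by the file sink
    filename: ./path/to/mce/file.json

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```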

## Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

| Field      | Required | Default | Description               |
| ---------- | -------- | ------- | ------------------------- |
| `filename` | ✅       |         | Path to file to write to. |

## Questions

If you've got any questions on configuring this sink, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
70 changes: 70 additions & 0 deletions metadata-ingestion/source_docs/athena.md
@@ -0,0 +1,70 @@
# Athena

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

## Setup

To install this plugin, run `pip install 'acryl-datahub[athena]'`.

## Capabilities

This plugin extracts the following:

- Metadata for databases, schemas, and tables
- Column types associated with each table

## Quickstart recipe

Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.

For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).

```yml
source:
  type: athena
  config:
    # Coordinates
    aws_region: my_aws_region_name
    work_group: my_work_group

    # Credentials
    username: my_aws_access_key_id
    password: my_aws_secret_access_key
    database: my_database

    # Options
    s3_staging_dir: "s3://<bucket-name>/<folder>/"

sink:
  # sink configs
```

## Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

| Field                  | Required | Default      | Description                                                                                                                                                                                               |
| ---------------------- | -------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `username`             |          | Autodetected | Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html                                                     |
| `password`             |          | Autodetected | Same detection scheme as `username`.                                                                                                                                                                      |
| `database`             |          | Autodetected |                                                                                                                                                                                                           |
| `aws_region`           | ✅       |              | AWS region code.                                                                                                                                                                                          |
| `s3_staging_dir`       | ✅       |              | Of format `"s3://<bucket-name>/prefix/"`. The `s3_staging_dir` parameter is needed because Athena always writes query results to S3. <br />See https://docs.aws.amazon.com/athena/latest/ug/querying.html. |
| `work_group`           | ✅       |              | Name of Athena workgroup. <br />See https://docs.aws.amazon.com/athena/latest/ug/manage-queries-control-costs-with-workgroups.html.                                                                      |
| `env`                  |          | `"PROD"`     | Environment to use in namespace when constructing URNs.                                                                                                                                                   |
| `options.<option>`     |          |              | Any options specified here will be passed to SQLAlchemy's `create_engine` as kwargs.<br />See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details.                   |
| `table_pattern.allow`  |          |              | Regex pattern for tables to include in ingestion.                                                                                                                                                         |
| `table_pattern.deny`   |          |              | Regex pattern for tables to exclude from ingestion.                                                                                                                                                       |
| `schema_pattern.allow` |          |              | Regex pattern for schemas to include in ingestion.                                                                                                                                                        |
| `schema_pattern.deny`  |          |              | Regex pattern for schemas to exclude from ingestion.                                                                                                                                                      |
| `view_pattern.allow`   |          |              | Regex pattern for views to include in ingestion.                                                                                                                                                          |
| `view_pattern.deny`    |          |              | Regex pattern for views to exclude from ingestion.                                                                                                                                                        |
| `include_tables`       |          | `True`       | Whether tables should be ingested.                                                                                                                                                                        |
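
To make the pattern options concrete, here is a sketch of scoping ingestion with allow/deny regexes. The database and pattern values are illustrative placeholders, and the list-of-regexes shape follows the framework's standard allow/deny pattern config:

```yml
source:
  type: athena
  config:
    aws_region: my_aws_region_name
    work_group: my_work_group
    s3_staging_dir: "s3://<bucket-name>/<folder>/"

    # only ingest tables in the marketing schema, skipping scratch tables
    table_pattern:
      allow:
        - "marketing\\..*"
      deny:
        - ".*_tmp"

sink:
  # sink configs
```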

## Compatibility

Coming soon!

## Questions

If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!