diff --git a/.test-infra/jenkins/job_PreCommit_BeamSQL_ZetaSQL_Java11.groovy b/.test-infra/jenkins/job_PreCommit_BeamSQL_ZetaSQL_Java11.groovy
new file mode 100644
index 0000000000000..50fd12411a15a
--- /dev/null
+++ b/.test-infra/jenkins/job_PreCommit_BeamSQL_ZetaSQL_Java11.groovy
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import PrecommitJobBuilder
+import CommonJobProperties as properties
+
+PrecommitJobBuilder builder = new PrecommitJobBuilder(
+    scope: this,
+    nameBase: 'JavaBeamZetaSQLJava11',
+    gradleTask: ':javaPreCommitBeamZetaSQL',
+    gradleSwitches: [
+      '-PdisableSpotlessCheck=true',
+      '-PcompileAndRunTestsWithJava11',
+      "-Pjava11Home=${properties.JAVA_11_HOME}"
+    ], // spotless checked in separate pre-commit
+    triggerPathPatterns: [
+      '^sdks/java/extensions/sql/.*$',
+    ]
+)
+builder.build {
+  publishers {
+    archiveJunit('**/build/test-results/**/*.xml')
+  }
+}
diff --git a/CHANGES.md b/CHANGES.md
index f1915fb79ba14..9aff55608c1d9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -60,7 +60,8 @@
 
 ## New Features / Improvements
 
-* --direct_num_workers=0 is supported for FnApi runner. It will set the number of threads/subprocesses to number of cores of the machine executing the pipeline ([BEAM-9443](https://issues.apache.org/jira/browse/BEAM-9443)).
+* `--workerCacheMB` flag is supported in Dataflow streaming pipelines ([BEAM-9964](https://issues.apache.org/jira/browse/BEAM-9964)).
+* `--direct_num_workers=0` is supported for the FnApi runner. It will set the number of threads/subprocesses to the number of cores of the machine executing the pipeline ([BEAM-9443](https://issues.apache.org/jira/browse/BEAM-9443)).
 * Python SDK now has experimental support for SqlTransform ([BEAM-8603](https://issues.apache.org/jira/browse/BEAM-8603)).
## Breaking Changes diff --git a/build.gradle b/build.gradle index 7fed2aa9308c8..848eef631ddf0 100644 --- a/build.gradle +++ b/build.gradle @@ -350,6 +350,7 @@ if (project.hasProperty('javaLinkageArtifactIds')) { if (project.hasProperty('compileAndRunTestsWithJava11')) { project.javaPreCommitPortabilityApi.dependsOn ':sdks:java:testing:test-utils:verifyJavaVersion' project.javaExamplesDataflowPrecommit.dependsOn ':sdks:java:testing:test-utils:verifyJavaVersion' + project.javaPreCommitBeamZetaSQL.dependsOn ':sdks:java:testing:test-utils:verifyJavaVersion' } else { allprojects { tasks.withType(Test) { diff --git a/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml index 75d8467d87c7a..82be05c692809 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Aggregation/Count/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076163 -update_date: Fri, 07 Feb 2020 14:07:39 UTC +update_date: Tue, 19 May 2020 07:02:13 UTC diff --git a/learning/katas/java/Common Transforms/Aggregation/Count/task.html b/learning/katas/java/Common Transforms/Aggregation/Count/task.md similarity index 89% rename from learning/katas/java/Common Transforms/Aggregation/Count/task.html rename to learning/katas/java/Common Transforms/Aggregation/Count/task.md index 41fd3be505c94..1c4eba0e75a00 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Count/task.html +++ b/learning/katas/java/Common Transforms/Aggregation/Count/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

+Aggregation - Count +------------------- + +**Kata:** Count the number of elements from an input. +
Use Count.
- diff --git a/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml index 397180092dac8..66a3816db9f11 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Aggregation/Max/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076167 -update_date: Fri, 07 Feb 2020 14:07:47 UTC +update_date: Tue, 19 May 2020 07:02:25 UTC diff --git a/learning/katas/java/Common Transforms/Aggregation/Max/task.html b/learning/katas/java/Common Transforms/Aggregation/Max/task.md similarity index 88% rename from learning/katas/java/Common Transforms/Aggregation/Max/task.html rename to learning/katas/java/Common Transforms/Aggregation/Max/task.md index 194adc5e7ddd4..541a380efd6c0 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Max/task.html +++ b/learning/katas/java/Common Transforms/Aggregation/Max/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

+Aggregation - Max +----------------- + +**Kata:** Compute the maximum of the elements from an input. +
Use Max.
- diff --git a/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml index 3a359d93fa3c6..819f70213b0f7 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Aggregation/Mean/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076165 -update_date: Fri, 07 Feb 2020 14:07:43 UTC +update_date: Tue, 19 May 2020 07:02:19 UTC diff --git a/learning/katas/java/Common Transforms/Aggregation/Mean/task.html b/learning/katas/java/Common Transforms/Aggregation/Mean/task.md similarity index 88% rename from learning/katas/java/Common Transforms/Aggregation/Mean/task.html rename to learning/katas/java/Common Transforms/Aggregation/Mean/task.md index 8934abf961f73..b08fd97a416a8 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Mean/task.html +++ b/learning/katas/java/Common Transforms/Aggregation/Mean/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

+Aggregation - Mean +------------------ + +**Kata:** Compute the mean/average of all elements from an input. +
Use Mean.
- diff --git a/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml index dd2bb21e4a03c..5c1eb0c485c9c 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Aggregation/Min/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076166 -update_date: Fri, 07 Feb 2020 14:07:45 UTC +update_date: Tue, 19 May 2020 07:02:22 UTC diff --git a/learning/katas/java/Common Transforms/Aggregation/Min/task.html b/learning/katas/java/Common Transforms/Aggregation/Min/task.md similarity index 88% rename from learning/katas/java/Common Transforms/Aggregation/Min/task.html rename to learning/katas/java/Common Transforms/Aggregation/Min/task.md index 157a7e1315536..da5b31905fca3 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Min/task.html +++ b/learning/katas/java/Common Transforms/Aggregation/Min/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

+Aggregation - Min +----------------- + +**Kata:** Compute the minimum of the elements from an input. +
Use Min.
- diff --git a/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml b/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml index c19114437d626..b1403ab246584 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Aggregation/Sum/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076164 -update_date: Fri, 07 Feb 2020 14:07:41 UTC +update_date: Tue, 19 May 2020 07:02:16 UTC diff --git a/learning/katas/java/Common Transforms/Aggregation/Sum/task.html b/learning/katas/java/Common Transforms/Aggregation/Sum/task.md similarity index 89% rename from learning/katas/java/Common Transforms/Aggregation/Sum/task.html rename to learning/katas/java/Common Transforms/Aggregation/Sum/task.md index 82511e7b95039..2031bbad5565f 100644 --- a/learning/katas/java/Common Transforms/Aggregation/Sum/task.html +++ b/learning/katas/java/Common Transforms/Aggregation/Sum/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

+Aggregation - Sum +----------------- + +**Kata:** Compute the sum of all elements from an input. +
Use Sum.
- diff --git a/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml b/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml index 5ec1ba0314b90..6033731695262 100644 --- a/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Filter/Filter/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076162 -update_date: Fri, 07 Feb 2020 14:07:31 UTC +update_date: Tue, 19 May 2020 07:02:10 UTC diff --git a/learning/katas/java/Common Transforms/Filter/Filter/task.html b/learning/katas/java/Common Transforms/Filter/Filter/task.md similarity index 73% rename from learning/katas/java/Common Transforms/Filter/Filter/task.html rename to learning/katas/java/Common Transforms/Filter/Filter/task.md index 15eb012ffa40d..e499074e236f9 100644 --- a/learning/katas/java/Common Transforms/Filter/Filter/task.html +++ b/learning/katas/java/Common Transforms/Filter/Filter/task.md @@ -16,19 +16,15 @@ ~ limitations under the License. --> - -

+Filter +------ + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +**Kata:** Implement a filter function that filters out the odd numbers by using +[Filter](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Filter.html). +
Use Filter.by(...).
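For orientation, a minimal sketch of the `Filter.by` pattern (the class name, pipeline, and input values here are illustrative, not the kata's starter code):

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.values.PCollection;

public class FilterSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6));
    // Filter.by keeps elements for which the predicate returns true,
    // so filtering out the odd numbers means the predicate accepts evens.
    PCollection<Integer> evens =
        numbers.apply(Filter.by((Integer number) -> number % 2 == 0));
    pipeline.run().waitUntilFinish();
  }
}
```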
- diff --git a/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml b/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml index f602ccca515b2..e84c2bb16a3a9 100644 --- a/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/Filter/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076161 -update_date: Fri, 07 Feb 2020 14:07:30 UTC +update_date: Tue, 19 May 2020 07:02:07 UTC diff --git a/learning/katas/java/Common Transforms/Filter/ParDo/task.html b/learning/katas/java/Common Transforms/Filter/ParDo/task.md similarity index 81% rename from learning/katas/java/Common Transforms/Filter/ParDo/task.html rename to learning/katas/java/Common Transforms/Filter/ParDo/task.md index 61adb7088bb36..be21a28143fc6 100644 --- a/learning/katas/java/Common Transforms/Filter/ParDo/task.html +++ b/learning/katas/java/Common Transforms/Filter/ParDo/task.md @@ -16,18 +16,15 @@ ~ limitations under the License. --> - -

+Filter using ParDo +------------------ + +**Kata:** Implement a filter function that filters out the even numbers by using +[DoFn](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/DoFn.html). +
Use ParDo with DoFn and only output the intended element.
- diff --git a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml index 90eb8aae3f2dc..870bb130cb9df 100644 --- a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml +++ b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076168 -update_date: Fri, 07 Feb 2020 14:07:53 UTC +update_date: Tue, 19 May 2020 07:02:28 UTC diff --git a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task.html b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task.md similarity index 86% rename from learning/katas/java/Common Transforms/WithKeys/WithKeys/task.html rename to learning/katas/java/Common Transforms/WithKeys/WithKeys/task.md index e95e56e44f862..15738d6f3e3b4 100644 --- a/learning/katas/java/Common Transforms/WithKeys/WithKeys/task.html +++ b/learning/katas/java/Common Transforms/WithKeys/WithKeys/task.md @@ -16,19 +16,18 @@ ~ limitations under the License. --> - -

+WithKeys +-------- + +**Kata:** Convert each fruit name into a KV of its first letter and itself, e.g. +`apple => KV.of("a", "apple")` +
Use WithKeys.
+
If using a lambda in Java 8, withKeyType(TypeDescriptor) must be called on the resulting PTransform.
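One possible shape for this, assuming an input `PCollection<String>` of fruit names (the method and class names are made up for illustration):

```java
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class WithKeysSketch {
  static PCollection<KV<String, String>> keyByFirstLetter(PCollection<String> fruits) {
    return fruits.apply(
        WithKeys.<String, String>of((String fruit) -> fruit.substring(0, 1))
            // Java erases the lambda's key type, so Beam needs an explicit
            // TypeDescriptor to infer a coder for the keys.
            .withKeyType(TypeDescriptors.strings()));
  }
}
```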
- diff --git a/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml b/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml index 40576c0da4c77..5e3927a5d3754 100644 --- a/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Branching/Branching/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076188 -update_date: Fri, 07 Feb 2020 14:28:13 UTC +update_date: Tue, 19 May 2020 07:01:56 UTC diff --git a/learning/katas/java/Core Transforms/Branching/Branching/task.html b/learning/katas/java/Core Transforms/Branching/Branching/task.md similarity index 76% rename from learning/katas/java/Core Transforms/Branching/Branching/task.html rename to learning/katas/java/Core Transforms/Branching/Branching/task.md index 12d9645aa03a5..3677fa9032225 100644 --- a/learning/katas/java/Core Transforms/Branching/Branching/task.html +++ b/learning/katas/java/Core Transforms/Branching/Branching/task.md @@ -16,20 +16,17 @@ ~ limitations under the License. --> - -

+Branching +--------- + +You can use the same PCollection as input for multiple transforms without consuming the input or +altering it. + +**Kata:** Branch out the numbers to two different transforms: one transform is multiplying each +number by 5 and the other transform is multiplying each number by 10. +
Refer to the Beam Design Your Pipeline Guide "Multiple transforms process the same PCollection" section for more information.
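A sketch of the branching shape (transform labels and names are illustrative): both branches read the same input PCollection independently.

```java
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class BranchingSketch {
  static void branch(PCollection<Integer> numbers) {
    // Neither branch consumes or mutates the input; each gets every element.
    PCollection<Integer> timesFive = numbers.apply("MultiplyBy5",
        MapElements.into(TypeDescriptors.integers()).via((Integer n) -> n * 5));
    PCollection<Integer> timesTen = numbers.apply("MultiplyBy10",
        MapElements.into(TypeDescriptors.integers()).via((Integer n) -> n * 10));
  }
}
```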
- diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml index 3e29e7a59f164..31dff911f354e 100644 --- a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076151 -update_date: Fri, 07 Feb 2020 14:06:21 UTC +update_date: Tue, 19 May 2020 07:08:21 UTC diff --git a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.html b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.md similarity index 75% rename from learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.html rename to learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.md index 29f5322b0f28c..8b24f7abee27a 100644 --- a/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.html +++ b/learning/katas/java/Core Transforms/CoGroupByKey/CoGroupByKey/task.md @@ -16,19 +16,17 @@ ~ limitations under the License. --> - -

+CoGroupByKey
+------------
+
+CoGroupByKey performs a relational join of two or more key/value PCollections that have the same
+key type.
+
+**Kata:** Implement a
+[CoGroupByKey](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/join/CoGroupByKey.html)
+transform that joins words by their first alphabetical letter, and then produces the toString()
+representation of the WordsAlphabet model.
+
Refer to CoGroupByKey, @@ -37,9 +35,9 @@

CoGroupByKey

CoGbkResult.
+
Refer to the Beam Programming Guide "CoGroupByKey" section for more information.
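A simplified sketch of the join mechanics, using two hypothetical keyed inputs in place of the kata's WordsAlphabet model:

```java
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TupleTag;

class CoGroupByKeySketch {
  static PCollection<String> join(
      PCollection<KV<String, String>> fruits, PCollection<KV<String, String>> countries) {
    TupleTag<String> fruitTag = new TupleTag<>();
    TupleTag<String> countryTag = new TupleTag<>();
    return KeyedPCollectionTuple.of(fruitTag, fruits)
        .and(countryTag, countries)
        .apply(CoGroupByKey.create())
        .apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) {
            CoGbkResult result = context.element().getValue();
            // All values from both inputs that share this key arrive together.
            context.output(context.element().getKey() + ": "
                + result.getAll(fruitTag) + " / " + result.getAll(countryTag));
          }
        }));
  }
}
```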
- diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml index 4e5e0f61f0f46..7dd4a961b2815 100644 --- a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076154 -update_date: Fri, 07 Feb 2020 14:34:40 UTC +update_date: Tue, 19 May 2020 07:01:38 UTC diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.html b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.md similarity index 75% rename from learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.html rename to learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.md index ccafa6219ae90..45a3e4d0582a7 100644 --- a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.html +++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn Lambda/task.md @@ -16,28 +16,25 @@ ~ limitations under the License. --> - -

+Combine - BinaryCombineFn Lambda +-------------------------------- + +BinaryCombineFn is used for implementing combiners that are more easily expressed as binary +operations. + +Since Beam v2.13.0, you can also use lambda or method reference in order to create the +BinaryCombineFn. + +**Kata:** Implement the summation of BigInteger using lambda or method reference. +
Refer to SerializableBiFunction.
+
Refer to the Beam Programming Guide "Combine" section for more information.
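A minimal sketch of the lambda form, assuming the `Combine.globally(SerializableBiFunction)` overload introduced around Beam 2.13.0 (helper names illustrative):

```java
import java.math.BigInteger;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.values.PCollection;

class BinaryCombineLambdaSketch {
  static PCollection<BigInteger> sum(PCollection<BigInteger> numbers) {
    // The method reference acts as the binary combiner: it is applied
    // pairwise, so it must be commutative and associative.
    return numbers.apply(Combine.globally(BigInteger::add));
  }
}
```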
- diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml index cec908ff11360..605c7c0153bf2 100644 --- a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076153 -update_date: Fri, 07 Feb 2020 14:34:37 UTC +update_date: Tue, 19 May 2020 07:01:35 UTC diff --git a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.html b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.md similarity index 55% rename from learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.html rename to learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.md index c18d3ac460033..ea026b1a9ac04 100644 --- a/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.html +++ b/learning/katas/java/Core Transforms/Combine/BinaryCombineFn/task.md @@ -16,35 +16,30 @@ ~ limitations under the License. --> - -

+Combine - BinaryCombineFn +------------------------- + +Combine is a Beam transform for combining collections of elements or values in your data. When you +apply a Combine transform, you must provide the function that contains the logic for combining the +elements or values. The combining function should be commutative and associative, as the function +is not necessarily invoked exactly once on all values with a given key. Because the input data +(including the value collection) may be distributed across multiple workers, the combining function +might be called multiple times to perform partial combining on subsets of the value collection. + +BinaryCombineFn is used for implementing combiners that are more easily expressed as binary +operations. + +**Kata:** Implement the summation of BigInteger using +[Combine.BinaryCombineFn](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.BinaryCombineFn.html). +
Extend the Combine.BinaryCombineFn class to compute the sum of the numbers.
+
Refer to the Beam Programming Guide "Combine" section for more information.
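One way the subclass could look (the class name is illustrative):

```java
import java.math.BigInteger;
import org.apache.beam.sdk.transforms.Combine;

class SumBigIntegerFn extends Combine.BinaryCombineFn<BigInteger> {
  // Invoked pairwise, possibly on partial results from different workers,
  // so the operation must be commutative and associative.
  @Override
  public BigInteger apply(BigInteger left, BigInteger right) {
    return left.add(right);
  }
}
```

It would then be applied with something like `numbers.apply(Combine.globally(new SumBigIntegerFn()))`.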
- diff --git a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml index e58c6a94ce470..ae7879c02da79 100644 --- a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076155 -update_date: Fri, 07 Feb 2020 14:34:43 UTC +update_date: Tue, 19 May 2020 07:01:41 UTC diff --git a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task.html b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task.md similarity index 67% rename from learning/katas/java/Core Transforms/Combine/Combine PerKey/task.html rename to learning/katas/java/Core Transforms/Combine/Combine PerKey/task.md index 62b6afb0c1b07..3f80616714d2d 100644 --- a/learning/katas/java/Core Transforms/Combine/Combine PerKey/task.html +++ b/learning/katas/java/Core Transforms/Combine/Combine PerKey/task.md @@ -16,33 +16,31 @@ ~ limitations under the License. --> - -

+Combine - Combine PerKey +------------------------ + +After creating a keyed PCollection (for example, by using a GroupByKey transform), a common pattern +is to combine the collection of values associated with each key into a single, merged value. This +pattern of a GroupByKey followed by merging the collection of values is equivalent to Combine +PerKey transform. The combine function you supply to Combine PerKey must be an associative +reduction function or a subclass of CombineFn. + +**Kata:** Implement the sum of scores per player using +[Combine.perKey](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/CombineFnBase.GlobalCombineFn.html). +
Use Combine.perKey(GlobalCombineFn).
+
Extend the Combine.BinaryCombineFn class to compute the sum of the numbers.
+
Refer to the Beam Programming Guide "Combining values in a keyed PCollection" section for more information.
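A sketch, assuming an input of `KV<String, Integer>` player/score pairs (names illustrative):

```java
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

class CombinePerKeySketch {
  static PCollection<KV<String, Integer>> totalScores(
      PCollection<KV<String, Integer>> playerScores) {
    // Groups by player name and reduces each player's scores in one step,
    // equivalent to GroupByKey followed by merging the grouped values.
    return playerScores.apply(Combine.perKey(Sum.ofIntegers()));
  }
}
```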
- diff --git a/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml index 6d028949442a7..195a816d65590 100644 --- a/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Combine/CombineFn/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076152 -update_date: Fri, 07 Feb 2020 14:34:34 UTC +update_date: Tue, 19 May 2020 07:01:32 UTC diff --git a/learning/katas/java/Core Transforms/Combine/CombineFn/task.html b/learning/katas/java/Core Transforms/Combine/CombineFn/task.md similarity index 51% rename from learning/katas/java/Core Transforms/Combine/CombineFn/task.html rename to learning/katas/java/Core Transforms/Combine/CombineFn/task.md index 94b6be384dd91..13bce44ecbf1a 100644 --- a/learning/katas/java/Core Transforms/Combine/CombineFn/task.html +++ b/learning/katas/java/Core Transforms/Combine/CombineFn/task.md @@ -16,37 +16,32 @@ ~ limitations under the License. --> - -

+Combine - CombineFn +------------------- + +Combine is a Beam transform for combining collections of elements or values in your data. When you +apply a Combine transform, you must provide the function that contains the logic for combining the +elements or values. The combining function should be commutative and associative, as the function +is not necessarily invoked exactly once on all values with a given key. Because the input data +(including the value collection) may be distributed across multiple workers, the combining function +might be called multiple times to perform partial combining on subsets of the value collection. + +Complex combination operations might require you to create a subclass of CombineFn that has an +accumulation type distinct from the input/output type. You should use CombineFn if the combine +function requires a more sophisticated accumulator, must perform additional pre- or post-processing, +might change the output type, or takes the key into account. + +**Kata:** Implement the average of numbers using +[Combine.CombineFn](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Combine.CombineFn.html). +
Extend the Combine.CombineFn class to compute the average of the numbers.
+
Refer to the Beam Programming Guide "Advanced combinations using CombineFn" section for more information.
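The classic shape of such a CombineFn, following the averaging example in the Beam programming guide (the accumulator implements Serializable here so Beam can infer a coder for it):

```java
import java.io.Serializable;
import org.apache.beam.sdk.transforms.Combine;

class AverageFn extends Combine.CombineFn<Integer, AverageFn.Accum, Double> {
  // The accumulator carries partial sums and counts between workers.
  static class Accum implements Serializable {
    int sum = 0;
    int count = 0;
  }

  @Override
  public Accum createAccumulator() {
    return new Accum();
  }

  @Override
  public Accum addInput(Accum accum, Integer input) {
    accum.sum += input;
    accum.count++;
    return accum;
  }

  @Override
  public Accum mergeAccumulators(Iterable<Accum> accums) {
    // Partial accumulators from different workers are merged here.
    Accum merged = createAccumulator();
    for (Accum accum : accums) {
      merged.sum += accum.sum;
      merged.count += accum.count;
    }
    return merged;
  }

  @Override
  public Double extractOutput(Accum accum) {
    return ((double) accum.sum) / accum.count;
  }
}
```

Applied as `numbers.apply(Combine.globally(new AverageFn()))`.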
- diff --git a/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml b/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml index 6bf9717cbf64f..cc9a7560915df 100644 --- a/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Combine/Simple Function/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076194 -update_date: Fri, 07 Feb 2020 14:34:33 UTC +update_date: Tue, 19 May 2020 07:07:04 UTC diff --git a/learning/katas/java/Core Transforms/Combine/Simple Function/task.html b/learning/katas/java/Core Transforms/Combine/Simple Function/task.md similarity index 57% rename from learning/katas/java/Core Transforms/Combine/Simple Function/task.html rename to learning/katas/java/Core Transforms/Combine/Simple Function/task.md index d501be8fcc2e5..b20fb9d4749ba 100644 --- a/learning/katas/java/Core Transforms/Combine/Simple Function/task.html +++ b/learning/katas/java/Core Transforms/Combine/Simple Function/task.md @@ -16,34 +16,29 @@ ~ limitations under the License. --> - -

+Combine - Simple Function +------------------------- + +Combine is a Beam transform for combining collections of elements or values in your data. When you +apply a Combine transform, you must provide the function that contains the logic for combining the +elements or values. The combining function should be commutative and associative, as the function +is not necessarily invoked exactly once on all values with a given key. Because the input data +(including the value collection) may be distributed across multiple workers, the combining function +might be called multiple times to perform partial combining on subsets of the value collection. + +Simple combine operations, such as sums, can usually be implemented as a simple function. + +**Kata:** Implement the summation of numbers using +[Combine.globally(SerializableFunction)](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/SerializableFunction.html). +
Implement the SerializableFunction.apply method that sums the elements of the Iterable.
+
Refer to the Beam Programming Guide "Simple combinations using simple functions" section for more information.
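A sketch of the simple-function form (the class name is illustrative):

```java
import org.apache.beam.sdk.transforms.SerializableFunction;

class SumNumbersFn implements SerializableFunction<Iterable<Integer>, Integer> {
  @Override
  public Integer apply(Iterable<Integer> input) {
    // Receives a (possibly partial) group of values and reduces it to one.
    int total = 0;
    for (int value : input) {
      total += value;
    }
    return total;
  }
}
```

It would be applied as `numbers.apply(Combine.globally(new SumNumbersFn()))`.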
- diff --git a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml index be0275aa8cf23..b0e1654834708 100644 --- a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076159 -update_date: Fri, 07 Feb 2020 14:07:15 UTC +update_date: Tue, 19 May 2020 07:01:59 UTC diff --git a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.html b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.html deleted file mode 100644 index 52c0f2460f2cf..0000000000000 --- a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.html +++ /dev/null @@ -1,51 +0,0 @@ - - - -

- diff --git a/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.md b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.md new file mode 100644 index 0000000000000..45854bfe95f31 --- /dev/null +++ b/learning/katas/java/Core Transforms/Composite Transform/Composite Transform/task.md @@ -0,0 +1,47 @@ + + +Composite Transform +------------------- + +Transforms can have a nested structure, where a complex transform performs multiple simpler +transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). +These transforms are called composite transforms. Nesting multiple transforms inside a single +composite transform can make your code more modular and easier to understand. + +To create your own composite transform, create a subclass of the PTransform class and override the +expand method to specify the actual processing logic. You can then use this transform just as you +would a built-in transform from the Beam SDK. For the PTransform class type parameters, you pass +the PCollection types that your transform takes as input, and produces as output. Within your +PTransform subclass, you’ll need to override the expand method. The expand method is where you add +the processing logic for the PTransform. Your override of expand must accept the appropriate type +of input PCollection as a parameter, and specify the output PCollection as the return value. + +**Kata:** Please implement a composite transform "ExtractAndMultiplyNumbers" that extracts numbers +from comma separated line and then multiplies each number by 10. + +
+ Refer to + PTransform. +
+ +
+ Refer to the Beam Programming Guide + + "Composite transforms" section for more information. +
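One possible shape for the composite (the exact tokenization and parsing choices are up to your implementation):

```java
import java.util.Arrays;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class ExtractAndMultiplyNumbers
    extends PTransform<PCollection<String>, PCollection<Integer>> {
  @Override
  public PCollection<Integer> expand(PCollection<String> input) {
    // Two simpler transforms nested inside one reusable composite transform.
    return input
        .apply(FlatMapElements.into(TypeDescriptors.strings())
            .via((String line) -> Arrays.asList(line.split(","))))
        .apply(MapElements.into(TypeDescriptors.integers())
            .via((String number) -> Integer.parseInt(number) * 10));
  }
}
```

Callers would then use it like any built-in transform: `input.apply(new ExtractAndMultiplyNumbers())`.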
diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml index e319aef322224..086f86a19212f 100644 --- a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076160 -update_date: Fri, 07 Feb 2020 14:07:21 UTC +update_date: Tue, 19 May 2020 07:02:03 UTC diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.html b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.html deleted file mode 100644 index c6e38b0473568..0000000000000 --- a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.html +++ /dev/null @@ -1,52 +0,0 @@ - - - -

- diff --git a/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.md b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.md new file mode 100644 index 0000000000000..32657dcb8635e --- /dev/null +++ b/learning/katas/java/Core Transforms/DoFn Additional Parameters/DoFn Additional Parameters/task.md @@ -0,0 +1,38 @@ + + +DoFn Additional Parameters +-------------------------- + +In addition to the element and the OutputReceiver, Beam will populate other parameters to your +DoFn’s @ProcessElement method. Any combination of these parameters can be added to your process +method in any order. + +* **Timestamp**: To access the timestamp of an input element, add a parameter annotated with +@Timestamp of type Instant +* **Window**: To access the window an input element falls into, add a parameter of the type of +the window used for the input PCollection. +* **PaneInfo**: When triggers are used, Beam provides a PaneInfo object that contains information +about the current firing. Using PaneInfo you can determine whether this is an early or a late +firing, and how many times this window has already fired for this key. +* **PipelineOptions**: The PipelineOptions for the current pipeline can always be accessed in a +process method by adding it as a parameter. + +Refer to the Beam Programming Guide +["Accessing additional parameters in your DoFn"](https://beam.apache.org/documentation/programming-guide/#other-dofn-parameters) +section for more information. diff --git a/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml b/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml index 7da7fb33fe672..f82552000d1b0 100644 --- a/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Flatten/Flatten/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076187 -update_date: Fri, 07 Feb 2020 14:28:11 UTC +update_date: Tue, 19 May 2020 07:01:44 UTC diff --git a/learning/katas/java/Core Transforms/Flatten/Flatten/task.html b/learning/katas/java/Core Transforms/Flatten/Flatten/task.md similarity index 73% rename from learning/katas/java/Core Transforms/Flatten/Flatten/task.html rename to learning/katas/java/Core Transforms/Flatten/Flatten/task.md index f0bf35e9c8048..5bf19ba5e6328 100644 --- a/learning/katas/java/Core Transforms/Flatten/Flatten/task.html +++ b/learning/katas/java/Core Transforms/Flatten/Flatten/task.md @@ -16,26 +16,24 @@ ~ limitations under the License. --> - -

+Flatten
+-------
+
+Flatten is a Beam transform for PCollection objects that store the same data type. Flatten merges
+multiple PCollection objects into a single logical PCollection.
+
+**Kata:** Implement a
+[Flatten](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Flatten.html)
+transform that merges two PCollections of words into a single PCollection.
+
Refer to Flatten to solve this problem.
+
Refer to the Beam Programming Guide "Flatten" section for more information.
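A sketch, assuming two input `PCollection<String>`s of words (names illustrative):

```java
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

class FlattenSketch {
  static PCollection<String> merge(
      PCollection<String> words1, PCollection<String> words2) {
    // All inputs must share the same element type (and compatible windowing).
    return PCollectionList.of(words1).and(words2).apply(Flatten.pCollections());
  }
}
```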
- diff --git a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml index c8fad3b294c05..803643efd658c 100644 --- a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076186 -update_date: Fri, 07 Feb 2020 14:28:07 UTC +update_date: Tue, 19 May 2020 07:01:26 UTC diff --git a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.html b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.md similarity index 64% rename from learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.html rename to learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.md index 54082b091bf27..cfaa2eb632ab5 100644 --- a/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.html +++ b/learning/katas/java/Core Transforms/GroupByKey/GroupByKey/task.md @@ -16,30 +16,28 @@ ~ limitations under the License. --> - -

+GroupByKey
+----------
+
+GroupByKey is a Beam transform for processing collections of key/value pairs. It’s a parallel
+reduction operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style algorithm. The
+input to GroupByKey is a collection of key/value pairs that represents a multimap, where the
+collection contains multiple pairs that have the same key, but different values. Given such a
+collection, you use GroupByKey to collect all of the values associated with each unique key.
+
+**Kata:** Implement a
+[GroupByKey](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/GroupByKey.html)
+transform that groups words by their first letter.
+
Refer to KV and GroupByKey to solve this problem.
+
Refer to the Beam Programming Guide "GroupByKey" section for more information.
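A sketch of the keyed-then-grouped shape (helper names illustrative):

```java
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class GroupByKeySketch {
  static PCollection<KV<String, Iterable<String>>> groupByFirstLetter(
      PCollection<String> words) {
    return words
        // Build KV pairs keyed by first letter, then gather the values per key.
        .apply(WithKeys.<String, String>of((String word) -> word.substring(0, 1))
            .withKeyType(TypeDescriptors.strings()))
        .apply(GroupByKey.create());
  }
}
```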
- diff --git a/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml index eccf16787840a..e237aa9e7b046 100644 --- a/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Map/FlatMapElements/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076150 -update_date: Fri, 07 Feb 2020 14:28:26 UTC +update_date: Tue, 19 May 2020 07:01:22 UTC diff --git a/learning/katas/java/Core Transforms/Map/FlatMapElements/task.html b/learning/katas/java/Core Transforms/Map/FlatMapElements/task.md similarity index 69% rename from learning/katas/java/Core Transforms/Map/FlatMapElements/task.html rename to learning/katas/java/Core Transforms/Map/FlatMapElements/task.md index 50f1627c0e7ef..8eb555cfdd42a 100644 --- a/learning/katas/java/Core Transforms/Map/FlatMapElements/task.html +++ b/learning/katas/java/Core Transforms/Map/FlatMapElements/task.md @@ -16,29 +16,24 @@ ~ limitations under the License. --> - -

+FlatMapElements +--------------- + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +FlatMapElements can be used to simplify a DoFn that maps an element to multiple elements (one to +many). + +**Kata:** Implement a function that maps each input sentence into words tokenized by whitespace +(" ") using [FlatMapElements.into(...).via(...)](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/FlatMapElements.html). +
Use FlatMapElements.into(...).via(...).
+
Refer to the Beam Programming Guide "Lightweight DoFns and other abstractions" section for more information.
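A sketch of the one-to-many mapping (class and method names illustrative):

```java
import java.util.Arrays;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class TokenizeSketch {
  static PCollection<String> tokenize(PCollection<String> sentences) {
    // One input sentence becomes many output words.
    return sentences.apply(
        FlatMapElements.into(TypeDescriptors.strings())
            .via((String sentence) -> Arrays.asList(sentence.split(" "))));
  }
}
```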
- diff --git a/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml index 644848f842731..ef561493f492d 100644 --- a/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Map/MapElements/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076149 -update_date: Fri, 07 Feb 2020 14:28:22 UTC +update_date: Tue, 19 May 2020 07:01:19 UTC diff --git a/learning/katas/java/Core Transforms/Map/MapElements/task.html b/learning/katas/java/Core Transforms/Map/MapElements/task.md similarity index 71% rename from learning/katas/java/Core Transforms/Map/MapElements/task.html rename to learning/katas/java/Core Transforms/Map/MapElements/task.md index 68ae60cae2cf3..05b77f8d6c4a7 100644 --- a/learning/katas/java/Core Transforms/Map/MapElements/task.html +++ b/learning/katas/java/Core Transforms/Map/MapElements/task.md @@ -16,27 +16,23 @@ ~ limitations under the License. --> - -

+MapElements +----------- + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +MapElements can be used to simplify a DoFn that maps an element to another element (one to one). + +**Kata:** Implement a simple map function that multiplies all input elements by 5 using +[MapElements.into(...).via(...)](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/MapElements.html). +
Use MapElements.into(...).via(...).
+
Refer to the Beam Programming Guide "Lightweight DoFns and other abstractions" section for more information.
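A sketch of the one-to-one mapping (class and method names illustrative):

```java
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class MultiplySketch {
  static PCollection<Integer> multiplyByFive(PCollection<Integer> numbers) {
    // into(...) declares the output type; via(...) supplies the 1:1 mapping.
    return numbers.apply(
        MapElements.into(TypeDescriptors.integers()).via((Integer n) -> n * 5));
  }
}
```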
- diff --git a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml index ae9acd805b2a2..c2a71f7bf3052 100644 --- a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076148 -update_date: Fri, 07 Feb 2020 14:28:19 UTC +update_date: Tue, 19 May 2020 07:05:56 UTC diff --git a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.html b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.md similarity index 90% rename from learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.html rename to learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.md index b9e134e7c151d..013ef1d6f1af1 100644 --- a/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.html +++ b/learning/katas/java/Core Transforms/Map/ParDo OneToMany/task.md @@ -16,23 +16,22 @@ ~ limitations under the License. --> - -

+ParDo OneToMany +--------------- + +**Kata:** Please write a ParDo that maps each input sentence into words tokenized by whitespace +(" "). +
You can call OutputReceiver multiple times in a ParDo.
+
If you're using a Beam version before v2.5.0, you can call DoFn.ProcessContext.output(..) multiple times in a ParDo.
- \ No newline at end of file diff --git a/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml b/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml index 89498764e53d6..60375db960619 100644 --- a/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Map/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076185 -update_date: Fri, 07 Feb 2020 14:28:05 UTC +update_date: Tue, 19 May 2020 07:01:14 UTC diff --git a/learning/katas/java/Core Transforms/Map/ParDo/task.html b/learning/katas/java/Core Transforms/Map/ParDo/task.md similarity index 71% rename from learning/katas/java/Core Transforms/Map/ParDo/task.html rename to learning/katas/java/Core Transforms/Map/ParDo/task.md index 15a0ea1fc68cb..bfcb16fdbb391 100644 --- a/learning/katas/java/Core Transforms/Map/ParDo/task.html +++ b/learning/katas/java/Core Transforms/Map/ParDo/task.md @@ -16,27 +16,25 @@ ~ limitations under the License. --> - -

+ParDo +----- + +ParDo is a Beam transform for generic parallel processing. The ParDo processing paradigm is similar +to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each +element in the input PCollection, performs some processing function (your user code) on that +element, and emits zero, one, or multiple elements to an output PCollection. + +**Kata:** Please write a simple ParDo that maps the input element by multiplying it by 10. +
Use ParDo with DoFn.
+
Refer to the Beam Programming Guide "ParDo" section for more information.
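A minimal sketch of the DoFn (the class name is illustrative):

```java
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

class MultiplyByTenSketch {
  static PCollection<Integer> multiplyByTen(PCollection<Integer> numbers) {
    return numbers.apply(ParDo.of(new DoFn<Integer, Integer>() {
      @ProcessElement
      public void processElement(@Element Integer number, OutputReceiver<Integer> out) {
        // Emits exactly one output per input; a DoFn may emit zero or many instead.
        out.output(number * 10);
      }
    }));
  }
}
```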
- diff --git a/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml b/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml index ad0c8123173d8..871b8da0e8ddc 100644 --- a/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Partition/Partition/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076156 -update_date: Fri, 07 Feb 2020 14:06:48 UTC +update_date: Tue, 19 May 2020 07:01:46 UTC diff --git a/learning/katas/java/Core Transforms/Partition/Partition/task.html b/learning/katas/java/Core Transforms/Partition/Partition/task.md similarity index 59% rename from learning/katas/java/Core Transforms/Partition/Partition/task.html rename to learning/katas/java/Core Transforms/Partition/Partition/task.md index 96e559c99341d..e254afddc335f 100644 --- a/learning/katas/java/Core Transforms/Partition/Partition/task.html +++ b/learning/katas/java/Core Transforms/Partition/Partition/task.md @@ -16,33 +16,29 @@ ~ limitations under the License. --> - -

+Partition +--------- + +Partition is a Beam transform for PCollection objects that store the same data type. Partition +splits a single PCollection into a fixed number of smaller collections. + +Partition divides the elements of a PCollection according to a partitioning function that you +provide. The partitioning function contains the logic that determines how to split up the elements +of the input PCollection into each resulting partition PCollection. + +**Kata:** Implement a +[Partition](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/transforms/Partition.html) +transform that splits a PCollection of numbers into two PCollections. The first PCollection +contains numbers greater than 100, and the second PCollection contains the remaining numbers. +
Refer to Partition to solve this problem.
+
Refer to the Beam Programming Guide "Partition" section for more information.
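A sketch of the two-way split; the partition function returns the index of the target partition (names illustrative):

```java
import org.apache.beam.sdk.transforms.Partition;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

class PartitionSketch {
  static PCollectionList<Integer> split(PCollection<Integer> numbers) {
    // The function maps each element to a partition index in [0, numPartitions).
    return numbers.apply(
        Partition.of(2, (Integer number, int numPartitions) -> number > 100 ? 0 : 1));
  }
}
```

The resulting list is indexed in partition order, e.g. `partitions.get(0)` for the numbers greater than 100.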
- diff --git a/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml b/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml index f0673f8c00dd9..e9c76e40dfb49 100644 --- a/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Side Input/Side Input/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076157 -update_date: Fri, 07 Feb 2020 14:06:55 UTC +update_date: Tue, 19 May 2020 07:01:49 UTC diff --git a/learning/katas/java/Core Transforms/Side Input/Side Input/task.html b/learning/katas/java/Core Transforms/Side Input/Side Input/task.md similarity index 65% rename from learning/katas/java/Core Transforms/Side Input/Side Input/task.html rename to learning/katas/java/Core Transforms/Side Input/Side Input/task.md index 9e7045b9476f9..6ebde279100ea 100644 --- a/learning/katas/java/Core Transforms/Side Input/Side Input/task.html +++ b/learning/katas/java/Core Transforms/Side Input/Side Input/task.md @@ -16,29 +16,27 @@ ~ limitations under the License. --> - -

+Side Input +---------- + +In addition to the main input PCollection, you can provide additional inputs to a ParDo transform +in the form of side inputs. A side input is an additional input that your DoFn can access each time +it processes an element in the input PCollection. When you specify a side input, you create a view +of some other data that can be read from within the ParDo transform’s DoFn while processing each +element. + +Side inputs are useful if your ParDo needs to inject additional data when processing each element +in the input PCollection, but the additional data needs to be determined at runtime (and not +hard-coded). Such values might be determined by the input data, or depend on a different branch of +your pipeline. + +**Kata:** Please enrich each Person with the country based on the city he/she lives in. +
Use View to create PCollectionView of citiesToCountries.
+
Use ParDo with @@ -46,9 +44,9 @@

Side Input

side input.
+
Refer to the Beam Programming Guide "Side inputs" section for more information.
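A simplified sketch of the mechanics, modeling persons as `KV<name, city>` rather than the kata's Person class (all names illustrative):

```java
import java.util.Map;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

class SideInputSketch {
  static PCollection<String> enrich(
      PCollection<KV<String, String>> persons,
      PCollection<KV<String, String>> citiesToCountries) {
    PCollectionView<Map<String, String>> citiesView =
        citiesToCountries.apply(View.asMap());
    return persons.apply(
        ParDo.of(new DoFn<KV<String, String>, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            // Look up the country for this person's city from the side input.
            String country = c.sideInput(citiesView).get(c.element().getValue());
            c.output(c.element().getKey() + " lives in " + country);
          }
        }).withSideInputs(citiesView));
  }
}
```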
- diff --git a/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml b/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml index 4eab016d77d90..5260041c1bd48 100644 --- a/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml +++ b/learning/katas/java/Core Transforms/Side Output/Side Output/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076158 -update_date: Fri, 07 Feb 2020 14:07:02 UTC +update_date: Tue, 19 May 2020 07:01:53 UTC diff --git a/learning/katas/java/Core Transforms/Side Output/Side Output/task.html b/learning/katas/java/Core Transforms/Side Output/Side Output/task.md similarity index 77% rename from learning/katas/java/Core Transforms/Side Output/Side Output/task.html rename to learning/katas/java/Core Transforms/Side Output/Side Output/task.md index d24f73d5253f8..d6902442bb539 100644 --- a/learning/katas/java/Core Transforms/Side Output/Side Output/task.html +++ b/learning/katas/java/Core Transforms/Side Output/Side Output/task.md @@ -16,18 +16,16 @@ ~ limitations under the License. --> - -

Side Output

-

- While ParDo always produces a main output PCollection (as the return value from apply), you can - also have your ParDo produce any number of additional output PCollections. If you choose to have - multiple outputs, your ParDo returns all of the output PCollections (including the main output) - bundled together. -

-

- Kata: Implement additional output to your ParDo for numbers bigger than 100. -

-
+Side Output +----------- + +While ParDo always produces a main output PCollection (as the return value from apply), you can +also have your ParDo produce any number of additional output PCollections. If you choose to have +multiple outputs, your ParDo returns all of the output PCollections (including the main output) +bundled together. + +**Kata:** Implement additional output to your ParDo for numbers bigger than 100. +
Use MultiOutputReceiver and @@ -36,9 +34,9 @@

Side Output

ParDo.
+
Refer to the Beam Programming Guide "Additional outputs" section for more information.
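A sketch of how the hints fit together, assuming an input `PCollection<Integer> numbers`; the tag names are illustrative:

```java
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

// Anonymous subclasses so the element type survives erasure.
final TupleTag<Integer> numBelowHundredTag = new TupleTag<Integer>() {};
final TupleTag<Integer> numAboveHundredTag = new TupleTag<Integer>() {};

PCollectionTuple outputs = numbers.apply(
    ParDo.of(new DoFn<Integer, Integer>() {
      @ProcessElement
      public void processElement(@Element Integer number, MultiOutputReceiver out) {
        if (number <= 100) {
          out.get(numBelowHundredTag).output(number);   // main output
        } else {
          out.get(numAboveHundredTag).output(number);   // additional output
        }
      }
    }).withOutputTags(numBelowHundredTag, TupleTagList.of(numAboveHundredTag)));
```

The main output and any additional outputs come back bundled in the `PCollectionTuple`, retrievable via `outputs.get(tag)`.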
- diff --git a/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml b/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml index 224dbaee0acd3..35e7dde636da9 100644 --- a/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml +++ b/learning/katas/java/Examples/Word Count/Word Count/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076176 -update_date: Fri, 07 Feb 2020 14:08:54 UTC +update_date: Tue, 19 May 2020 07:02:51 UTC diff --git a/learning/katas/java/Examples/Word Count/Word Count/task.html b/learning/katas/java/Examples/Word Count/Word Count/task.md similarity index 78% rename from learning/katas/java/Examples/Word Count/Word Count/task.html rename to learning/katas/java/Examples/Word Count/Word Count/task.md index a963aab4cb1b2..d8dbdff4bef5d 100644 --- a/learning/katas/java/Examples/Word Count/Word Count/task.html +++ b/learning/katas/java/Examples/Word Count/Word Count/task.md @@ -16,21 +16,18 @@ ~ limitations under the License. --> - -

Word Count Pipeline

-

- Kata: Create a pipeline that counts the number of words. -

-

- Please output the count of each word in the following format: -

-
-  word:count
-  ball:5
-  book:3
-
-
+Word Count Pipeline +------------------- + +**Kata:** Create a pipeline that counts the number of words. + +Please output the count of each word in the following format: +```text +word:count +ball:5 +book:3 +``` +
Refer to your katas above.
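One plausible shape for the pipeline, assuming whitespace-separated words arriving as a `PCollection<String> lines`:

```java
import java.util.Arrays;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

PCollection<String> wordCounts =
    lines
        // Split each line into individual words.
        .apply(FlatMapElements.into(TypeDescriptors.strings())
            .via(line -> Arrays.asList(line.split(" "))))
        // Count occurrences of each distinct word.
        .apply(Count.perElement())
        // Format each KV<String, Long> as "word:count".
        .apply(MapElements.into(TypeDescriptors.strings())
            .via(kv -> kv.getKey() + ":" + kv.getValue()));
```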
- diff --git a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml index ef8aff28455e2..caf130d69278d 100644 --- a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml +++ b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076208 -update_date: Fri, 07 Feb 2020 14:52:55 UTC +update_date: Tue, 19 May 2020 07:02:32 UTC diff --git a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.md similarity index 64% rename from learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html rename to learning/katas/java/IO/Built-in IOs/Built-in IOs/task.md index ef1b2083a8e90..b083c732eeb1c 100644 --- a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.html +++ b/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.md @@ -16,17 +16,14 @@ ~ limitations under the License. --> - -

Built-in I/Os

-

- Beam SDKs provide many out of the box I/O transforms that can be used to read from many - different sources and write to many different sinks. -

-

- See the Beam-provided I/O - Transforms page for a list of the currently available I/O transforms. -

-

- Note: There is no kata for this task. Please proceed to the next task. -

- +Built-in I/Os +------------- + +Beam SDKs provide many out of the box I/O transforms that can be used to read from many different +sources and write to many different sinks. + +See the [Beam-provided I/O Transforms](https://beam.apache.org/documentation/io/built-in/) page for +a list of the currently available I/O transforms. + +**Note:** There is no kata for this task. Please click the "Check" button and proceed to the next +task. \ No newline at end of file diff --git a/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml b/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml index d66bb01abd251..b904cbe131c8a 100644 --- a/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml +++ b/learning/katas/java/IO/TextIO/TextIO Read/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076189 -update_date: Fri, 07 Feb 2020 14:28:15 UTC +update_date: Tue, 19 May 2020 07:06:02 UTC diff --git a/learning/katas/java/IO/TextIO/TextIO Read/task.html b/learning/katas/java/IO/TextIO/TextIO Read/task.md similarity index 62% rename from learning/katas/java/IO/TextIO/TextIO Read/task.html rename to learning/katas/java/IO/TextIO/TextIO Read/task.md index 1ebad84212594..23dd394cae82d 100644 --- a/learning/katas/java/IO/TextIO/TextIO Read/task.html +++ b/learning/katas/java/IO/TextIO/TextIO Read/task.md @@ -16,32 +16,29 @@ ~ limitations under the License. --> - -

TextIO Read

-

- When you create a pipeline, you often need to read data from some external source, such as a file - or a database. Likewise, you may want your pipeline to output its result data to an external - storage system. Beam provides read and write transforms for a number of common data storage types. - If you want your pipeline to read from or write to a data storage format that isn’t supported by - the built-in transforms, you can implement your own read and write transforms. -

-

- To read a PCollection from one or more text files, use TextIO.read() to instantiate a transform - and use TextIO.Read.from(String) to specify the path of the file(s) to be read. -

-

- Kata: Read the 'countries.txt' file and convert each country name into uppercase. -

-
+TextIO Read +----------- + +When you create a pipeline, you often need to read data from some external source, such as a file +or a database. Likewise, you may want your pipeline to output its result data to an external +storage system. Beam provides read and write transforms for a number of common data storage types. +If you want your pipeline to read from or write to a data storage format that isn’t supported by +the built-in transforms, you can implement your own read and write transforms. + +To read a PCollection from one or more text files, use TextIO.read() to instantiate a transform +and use TextIO.Read.from(String) to specify the path of the file(s) to be read. + +**Kata:** Read the 'countries.txt' file and convert each country name into uppercase. +
Use TextIO and its corresponding TextIO.read() method.
+
Refer to the Beam Programming Guide "Reading input data" section for more information.
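A minimal sketch of the read-and-transform step (`pipeline` is the usual `Pipeline` object; the file path comes from the kata):

```java
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

PCollection<String> upperCased =
    pipeline
        // Each line of the file becomes one element.
        .apply(TextIO.read().from("countries.txt"))
        // Convert each country name to uppercase.
        .apply(MapElements.into(TypeDescriptors.strings())
            .via(country -> country.toUpperCase()));
```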
- diff --git a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml index 521fdb194b38e..f1bd95787b4ae 100644 --- a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml +++ b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076147 -update_date: Fri, 07 Feb 2020 14:05:53 UTC +update_date: Tue, 19 May 2020 07:01:10 UTC diff --git a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task.html b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task.md similarity index 51% rename from learning/katas/java/Introduction/Hello Beam/Hello Beam/task.html rename to learning/katas/java/Introduction/Hello Beam/Hello Beam/task.md index 28e579bf8f984..c1ef872d6d7a3 100644 --- a/learning/katas/java/Introduction/Hello Beam/Hello Beam/task.html +++ b/learning/katas/java/Introduction/Hello Beam/Hello Beam/task.md @@ -16,38 +16,34 @@ ~ limitations under the License. --> - -

Hello Beam Pipeline

-

- Apache Beam is an open source, unified model for defining both batch and streaming data-parallel - processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the - pipeline. The pipeline is then executed by one of Beam’s supported distributed processing - back-ends, which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. -

-

- Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the - problem can be decomposed into many smaller bundles of data that can be processed independently - and in parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data - integration. These tasks are useful for moving data between different storage media and data - sources, transforming data into a more desirable format, or loading data onto a new system. -

-

- To learn more about Apache Beam, refer to - Apache Beam Overview. -

-

- Kata: Your first kata is to create a simple pipeline that takes a hardcoded input element - "Hello Beam". -

-
+Welcome To Apache Beam +---------------------- + +Apache Beam is an open source, unified model for defining both batch and streaming data-parallel +processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the +pipeline. The pipeline is then executed by one of Beam’s supported distributed processing back-ends, +which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. + +Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the problem +can be decomposed into many smaller bundles of data that can be processed independently and in +parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data +integration. These tasks are useful for moving data between different storage media and data +sources, transforming data into a more desirable format, or loading data onto a new system. + +To learn more about Apache Beam, refer to +[Apache Beam Overview](https://beam.apache.org/get-started/beam-overview/). + +**Kata:** Your first kata is to create a simple pipeline that takes a hardcoded input element +"Hello Beam". +
Hardcoded input can be created using Create.
+
Refer to the Beam Programming Guide "Creating a PCollection from in-memory data" section for more information.
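A sketch of roughly the smallest pipeline that satisfies the kata:

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

// Create.of turns the hardcoded value into a one-element PCollection.
PCollection<String> hello = pipeline.apply(Create.of("Hello Beam"));

pipeline.run();
```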
- diff --git a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml index 823ad7b76d006..6ad36e81349f1 100644 --- a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml +++ b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076174 -update_date: Fri, 07 Feb 2020 14:08:38 UTC +update_date: Tue, 19 May 2020 07:02:45 UTC diff --git a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task.html b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task.md similarity index 83% rename from learning/katas/java/Triggers/Early Triggers/Early Triggers/task.html rename to learning/katas/java/Triggers/Early Triggers/Early Triggers/task.md index 6a7f1cbad6c1c..12d579f319ed1 100644 --- a/learning/katas/java/Triggers/Early Triggers/Early Triggers/task.html +++ b/learning/katas/java/Triggers/Early Triggers/Early Triggers/task.md @@ -16,44 +16,45 @@ ~ limitations under the License. --> - -

Early Triggers

-

- Triggers allow Beam to emit early results, before all the data in a given window has arrived. - For example, emitting after a certain amount of time elapses, or after a certain number of - elements arrives. -

-

- Kata: Given that events are being generated every second and a fixed window of 1-day - duration, please implement an early trigger that emits the number of events count immediately - after new element is processed. -

-
+Early Triggers
+--------------
+
+Triggers allow Beam to emit early results, before all the data in a given window has arrived. For
+example, emitting after a certain amount of time elapses, or after a certain number of elements
+arrives.
+
+**Kata:** Given that events are being generated every second and a fixed window of 1-day duration,
+please implement an early trigger that emits the count of events immediately after a new element
+is processed.
+
Use withEarlyFirings to set early firing triggers.
+
Use FixedWindows with 1-day duration using AfterWatermark.pastEndOfWindow() trigger.
+
Set the allowed lateness to 0 with discarding accumulation mode.
+
Use Combine.globally and Count.combineFn to calculate the count of events.
+
Refer to the Beam Programming Guide "Event time triggers" section for more information.
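Putting the hints together, one possible configuration, assuming a `PCollection<String> events`:

```java
import org.joda.time.Duration;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.windowing.AfterPane;
import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;

PCollection<Long> counts =
    events
        .apply(Window.<String>into(FixedWindows.of(Duration.standardDays(1)))
            // Fire early on every new element, before the watermark closes the window.
            .triggering(AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(AfterPane.elementCountAtLeast(1)))
            .withAllowedLateness(Duration.ZERO)
            .discardingFiredPanes())
        // withoutDefaults: do not emit a default value for empty windows.
        .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
```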
- \ No newline at end of file diff --git a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml index 56a53bb168618..d4d5c9da9d10f 100644 --- a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml +++ b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076173 -update_date: Fri, 07 Feb 2020 14:08:31 UTC +update_date: Tue, 19 May 2020 07:02:42 UTC diff --git a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.html b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.md similarity index 59% rename from learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.html rename to learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.md index 5a124aa3ccb40..273ac8f637b7c 100644 --- a/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.html +++ b/learning/katas/java/Triggers/Event Time Triggers/Event Time Triggers/task.md @@ -16,63 +16,57 @@ ~ limitations under the License. --> - -

Event Time Triggers

-

- When collecting and grouping data into windows, Beam uses triggers to determine when to emit the - aggregated results of each window (referred to as a pane). If you use Beam’s default windowing - configuration and default trigger, Beam outputs the aggregated result when it estimates all data - has arrived, and discards all subsequent data for that window. -

-

- You can set triggers for your PCollections to change this default behavior. Beam provides a - number of pre-built triggers that you can set: -

-
- -
-

- Event time triggers operate on the event time, as indicated by the timestamp on each data - element. Beam’s default trigger is event time-based. -

-

- The AfterWatermark trigger operates on event time. The AfterWatermark trigger emits the contents - of a window after the watermark passes the end of the window, based on the timestamps attached - to the data elements. The watermark is a global progress metric, and is Beam’s notion of input - completeness within your pipeline at any given point. AfterWatermark.pastEndOfWindow() only fires - when the watermark passes the end of the window. -

-

- Kata: Given that events are being generated every second, please implement a trigger that - emits the number of events count within a fixed window of 5-second duration. -

-
+Event Time Triggers
+-------------------
+
+When collecting and grouping data into windows, Beam uses triggers to determine when to emit the
+aggregated results of each window (referred to as a pane). If you use Beam’s default windowing
+configuration and default trigger, Beam outputs the aggregated result when it estimates all data
+has arrived, and discards all subsequent data for that window.
+
+You can set triggers for your PCollections to change this default behavior. Beam provides a number
+of pre-built triggers that you can set:
+
+* Event time triggers
+* Processing time triggers
+* Data-driven triggers
+* Composite triggers
+
+Event time triggers operate on the event time, as indicated by the timestamp on each data element.
+Beam’s default trigger is event time-based.
+
+The AfterWatermark trigger operates on event time. The AfterWatermark trigger emits the contents
+of a window after the watermark passes the end of the window, based on the timestamps attached to
+the data elements. The watermark is a global progress metric, and is Beam’s notion of input
+completeness within your pipeline at any given point. AfterWatermark.pastEndOfWindow() only fires
+when the watermark passes the end of the window.
+
+**Kata:** Given that events are being generated every second, please implement a trigger that
+emits the count of events within a fixed window of 5-second duration.
+
Use FixedWindows with 5-second duration using AfterWatermark.pastEndOfWindow() trigger.
+
Set the allowed lateness to 0 with discarding accumulation mode.
+
Use Combine.globally and Count.combineFn to calculate the count of events.
+
Refer to the Beam Programming Guide "Event time triggers" section for more information.
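A sketch of the hinted configuration (same imports as the Early Triggers sketch above, minus `AfterPane`), again assuming a `PCollection<String> events`:

```java
PCollection<Long> counts =
    events
        .apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(5)))
            // Emit once, when the watermark passes the end of each 5-second window.
            .triggering(AfterWatermark.pastEndOfWindow())
            .withAllowedLateness(Duration.ZERO)
            .discardingFiredPanes())
        .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
```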
- diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml index f69bd76fa9859..75c5c3e2be6b2 100644 --- a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml +++ b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076175 -update_date: Fri, 07 Feb 2020 14:08:45 UTC +update_date: Tue, 19 May 2020 07:02:47 UTC diff --git a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.html b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.md similarity index 80% rename from learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.html rename to learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.md index f40784eabdc60..90f44861899c5 100644 --- a/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.html +++ b/learning/katas/java/Triggers/Window Accumulation Mode/Window Accumulation Mode/task.md @@ -16,48 +16,50 @@ ~ limitations under the License. --> - -

Window Accumulation Mode

-

- When you specify a trigger, you must also set the the window’s accumulation mode. When a trigger - fires, it emits the current contents of the window as a pane. Since a trigger can fire multiple - times, the accumulation mode determines whether the system accumulates the window panes as the - trigger fires, or discards them. -

-

- Kata: Given that events are being generated every second and a fixed window of 1-day - duration, please implement an early trigger that emits the number of events count immediately - after new element is processed in accumulating mode. -

-
+Window Accumulation Mode
+------------------------
+
+When you specify a trigger, you must also set the window’s accumulation mode. When a trigger
+fires, it emits the current contents of the window as a pane. Since a trigger can fire multiple
+times, the accumulation mode determines whether the system accumulates the window panes as the
+trigger fires, or discards them.
+
+**Kata:** Given that events are being generated every second and a fixed window of 1-day duration,
+please implement an early trigger that emits the count of events immediately after a new element
+is processed, in accumulating mode.
+
Use accumulatingFiredPanes() to set a window to accumulate the panes that are produced when the trigger fires.
+
Use withEarlyFirings to set early firing triggers.
+
Use FixedWindows with 1-day duration using AfterWatermark.pastEndOfWindow() trigger.
+
Set the allowed lateness to 0.
+
Use Combine.globally and Count.combineFn to calculate the count of events.
+
Refer to the Beam Programming Guide "Event time triggers" section for more information.
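Relative to the Early Triggers sketch above, only the accumulation mode changes; a sketch with the same imports:

```java
PCollection<Long> counts =
    events
        .apply(Window.<String>into(FixedWindows.of(Duration.standardDays(1)))
            .triggering(AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(AfterPane.elementCountAtLeast(1)))
            .withAllowedLateness(Duration.ZERO)
            // Each early pane now includes everything seen so far in the window.
            .accumulatingFiredPanes())
        .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
```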
- diff --git a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml index 6541f3ccc803f..ffc33a03d3788 100644 --- a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml +++ b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076170 -update_date: Fri, 07 Feb 2020 14:08:15 UTC +update_date: Tue, 19 May 2020 07:02:35 UTC diff --git a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task.html b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task.md similarity index 76% rename from learning/katas/java/Windowing/Adding Timestamp/ParDo/task.html rename to learning/katas/java/Windowing/Adding Timestamp/ParDo/task.md index 403fc11d0d0a2..c801480f5e652 100644 --- a/learning/katas/java/Windowing/Adding Timestamp/ParDo/task.html +++ b/learning/katas/java/Windowing/Adding Timestamp/ParDo/task.md @@ -16,33 +16,31 @@ ~ limitations under the License. --> - -

Adding Timestamp - ParDo

-

- Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need - timestamps, you must add them to your PCollection’s elements. -

-

- You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that - outputs new elements with timestamps that you set. -

-

- Kata: Please assign each element a timestamp based on the the Event.getDate(). -

-
+Adding Timestamp - ParDo
+------------------------
+
+Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need
+timestamps, you must add them to your PCollection’s elements.
+
+You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that
+outputs new elements with timestamps that you set.
+
+**Kata:** Please assign each element a timestamp based on the `Event.getDate()`.
+
Use ParDo with DoFn.
+
Use OutputReceiver.outputWithTimestamp method to assign timestamp to the element.
+
Refer to the Beam Programming Guide "Adding timestamps to a PCollection’s elements" section for more information.
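A sketch of the timestamping DoFn, assuming `Event.getDate()` returns a Joda-Time `DateTime` (the actual return type comes from the kata's model class):

```java
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

PCollection<Event> timestamped = events.apply(
    ParDo.of(new DoFn<Event, Event>() {
      @ProcessElement
      public void processElement(@Element Event event, OutputReceiver<Event> out) {
        // Re-emit the element with its event time taken from the model.
        out.outputWithTimestamp(event, event.getDate().toInstant());
      }
    }));
```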
- diff --git a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml index 5e8431f094d31..c4ecf093ad696 100644 --- a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml +++ b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076190 -update_date: Fri, 07 Feb 2020 14:28:17 UTC +update_date: Tue, 19 May 2020 07:06:05 UTC diff --git a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.html b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.md similarity index 71% rename from learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.html rename to learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.md index bd49a7424fb31..1fdede83349f0 100644 --- a/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.html +++ b/learning/katas/java/Windowing/Adding Timestamp/WithTimestamps/task.md @@ -16,27 +16,24 @@ ~ limitations under the License. --> - -

Adding Timestamp - WithTimestamps

-

- Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need - timestamps, you must add them to your PCollection’s elements. -

-

- You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that - outputs new elements with timestamps that you set. -

-

- Kata: Please assign each element a timestamp based on the the Event.getDate(). -

-
+Adding Timestamp - WithTimestamps
+---------------------------------
+
+Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need
+timestamps, you must add them to your PCollection’s elements.
+
+You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that
+outputs new elements with timestamps that you set.
+
+**Kata:** Please assign each element a timestamp based on the `Event.getDate()`.
+
Use WithTimestamps.
+
Refer to the Beam Programming Guide "Adding timestamps to a PCollection’s elements" section for more information.
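With `WithTimestamps`, the same assignment collapses to one transform; a sketch under the same `Event.getDate()` assumption:

```java
import org.apache.beam.sdk.transforms.WithTimestamps;
import org.apache.beam.sdk.values.PCollection;

PCollection<Event> timestamped =
    events.apply(WithTimestamps.of((Event event) -> event.getDate().toInstant()));
```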
- \ No newline at end of file diff --git a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml index 14e82e16ea427..574b554cfff00 100644 --- a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml +++ b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076172 -update_date: Fri, 07 Feb 2020 14:08:23 UTC +update_date: Tue, 19 May 2020 07:02:39 UTC diff --git a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.html b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.html deleted file mode 100644 index 7f010c79b63ed..0000000000000 --- a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.html +++ /dev/null @@ -1,61 +0,0 @@ - - - -

Fixed Time Window

-

- Windowing subdivides a PCollection according to the timestamps of its individual elements. - Transforms that aggregate multiple elements, such as GroupByKey and Combine, work implicitly on - a per-window basis — they process each PCollection as a succession of multiple, finite windows, - though the entire collection itself may be of unbounded size. -

-

- In the Beam model, any PCollection (including unbounded PCollections) can be subdivided into - logical windows. Each element in a PCollection is assigned to one or more windows according to - the PCollection’s windowing function, and each individual window contains a finite number of - elements. Grouping transforms then consider each PCollection’s elements on a per-window basis. - GroupByKey, for example, implicitly groups the elements of a PCollection by key and window. -

-
- Beam provides several windowing functions, including: - -
-

- The simplest form of windowing is using fixed time windows. A fixed time window represents a - consistent duration, non overlapping time interval in the data stream. -

-

- Kata: Please count the number of events that happened based on fixed window with - 1-day duration. -

-
-
- Use - FixedWindows with 1-day duration. -
-
- Refer to the Beam Programming Guide - - "Fixed time windows" section for more information. -
- \ No newline at end of file diff --git a/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.md b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.md new file mode 100644 index 0000000000000..ff4c19c285a5e --- /dev/null +++ b/learning/katas/java/Windowing/Fixed Time Window/Fixed Time Window/task.md @@ -0,0 +1,53 @@ + + +Fixed Time Window
+-----------------
+
+Windowing subdivides a PCollection according to the timestamps of its individual elements.
+Transforms that aggregate multiple elements, such as GroupByKey and Combine, work implicitly on a
+per-window basis — they process each PCollection as a succession of multiple, finite windows,
+though the entire collection itself may be of unbounded size.
+
+In the Beam model, any PCollection (including unbounded PCollections) can be subdivided into
+logical windows. Each element in a PCollection is assigned to one or more windows according to the
+PCollection’s windowing function, and each individual window contains a finite number of elements.
+Grouping transforms then consider each PCollection’s elements on a per-window basis. GroupByKey,
+for example, implicitly groups the elements of a PCollection by key and window.
+
+Beam provides several windowing functions, including:
+* Fixed Time Windows
+* Sliding Time Windows
+* Per-Session Windows
+* Single Global Window
+
+The simplest form of windowing is using fixed time windows. A fixed time window represents a
+consistent duration, non-overlapping time interval in the data stream.
+
+**Kata:** Please count the number of events that happened based on a fixed window with 1-day
+duration.
+
+ Use + FixedWindows with 1-day duration. +
+ +
+ Refer to the Beam Programming Guide + + "Fixed time windows" section for more information. +
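A sketch combining the two hints, assuming a `PCollection<String> events` and the default trigger:

```java
import org.joda.time.Duration;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;

PCollection<Long> countsPerDay =
    events
        // Assign each element to the 1-day window its timestamp falls into.
        .apply(Window.<String>into(FixedWindows.of(Duration.standardDays(1))))
        // Count elements; grouping happens per window automatically.
        .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
```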
diff --git a/learning/katas/java/course-remote-info.yaml b/learning/katas/java/course-remote-info.yaml index ce1e82e6db858..abf1f71e0788a 100644 --- a/learning/katas/java/course-remote-info.yaml +++ b/learning/katas/java/course-remote-info.yaml @@ -1,2 +1,2 @@ id: 54530 -update_date: Fri, 07 Feb 2020 14:05:35 UTC +update_date: Tue, 19 May 2020 06:52:23 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml index f3557d3fc128d..410c0831e35b5 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Aggregation/Count/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755597 -update_date: Fri, 07 Feb 2020 13:57:24 UTC +update_date: Tue, 19 May 2020 03:05:33 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/task.html b/learning/katas/python/Common Transforms/Aggregation/Count/task.md similarity index 89% rename from learning/katas/python/Common Transforms/Aggregation/Count/task.html rename to learning/katas/python/Common Transforms/Aggregation/Count/task.md index b9ad594a7ff81..810678bbc9593 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Count/task.html +++ b/learning/katas/python/Common Transforms/Aggregation/Count/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

Aggregation - Count

-

- Kata: Count the number of elements from an input. -

-
+Aggregation - Count +------------------- + +**Kata:** Count the number of elements from an input. +
Use Count.
- diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/tests.py b/learning/katas/python/Common Transforms/Aggregation/Count/tests.py index ccc96cb6e5aee..3cdcee10214ee 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Count/tests.py +++ b/learning/katas/python/Common Transforms/Aggregation/Count/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml index a586df3b2765f..b5dd948d8a7a2 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Aggregation/Largest/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755601 -update_date: Fri, 07 Feb 2020 13:57:37 UTC +update_date: Tue, 19 May 2020 03:05:45 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/task.html b/learning/katas/python/Common Transforms/Aggregation/Largest/task.md similarity index 89% rename from learning/katas/python/Common Transforms/Aggregation/Largest/task.html rename to learning/katas/python/Common Transforms/Aggregation/Largest/task.md index 9c9fe4f609ebf..47716d7e6f6dc 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Largest/task.html +++ b/learning/katas/python/Common Transforms/Aggregation/Largest/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

Aggregation - Largest

-

- Kata: Compute the largest of the elements from an input. -

-
+Aggregation - Largest +--------------------- + +**Kata:** Compute the largest of the elements from an input. +
Use Top.Largest.
- diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/tests.py b/learning/katas/python/Common Transforms/Aggregation/Largest/tests.py index 0fced6e38d784..9a420cdb93710 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Largest/tests.py +++ b/learning/katas/python/Common Transforms/Aggregation/Largest/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml index c26a06215eb71..c89e88a793e25 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Aggregation/Mean/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755599 -update_date: Fri, 07 Feb 2020 13:57:30 UTC +update_date: Tue, 19 May 2020 03:05:39 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/task.html b/learning/katas/python/Common Transforms/Aggregation/Mean/task.md similarity index 88% rename from learning/katas/python/Common Transforms/Aggregation/Mean/task.html rename to learning/katas/python/Common Transforms/Aggregation/Mean/task.md index 2434ba29cd4a5..153c550144e1a 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Mean/task.html +++ b/learning/katas/python/Common Transforms/Aggregation/Mean/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

Aggregation - Mean

-

- Kata: Compute the mean/average of all elements from an input. -

-
+Aggregation - Mean +------------------ + +**Kata:** Compute the mean/average of all elements from an input. +
Use Mean.
- diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/tests.py b/learning/katas/python/Common Transforms/Aggregation/Mean/tests.py index 9e99d3971d2cc..0f21c4886a6cf 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Mean/tests.py +++ b/learning/katas/python/Common Transforms/Aggregation/Mean/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml index e96b8565e22b4..68f18b39ac746 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755600 -update_date: Fri, 07 Feb 2020 13:57:34 UTC +update_date: Tue, 19 May 2020 03:05:42 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/task.html b/learning/katas/python/Common Transforms/Aggregation/Smallest/task.md similarity index 88% rename from learning/katas/python/Common Transforms/Aggregation/Smallest/task.html rename to learning/katas/python/Common Transforms/Aggregation/Smallest/task.md index 49cc7ad0dc3c6..44cf28fab3271 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Smallest/task.html +++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/task.md @@ -16,14 +16,12 @@ ~ limitations under the License. --> - -

Aggregation - Smallest

-

- Kata: Compute the smallest of the elements from an input. -

-
+Aggregation - Smallest +---------------------- + +**Kata:** Compute the smallest of the elements from an input. +
Use Top.Smallest.
- diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/tests.py b/learning/katas/python/Common Transforms/Aggregation/Smallest/tests.py index 5bfa80c57c3f3..0e574670e5321 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Smallest/tests.py +++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_answer_placeholders_text_deleted, test_is_not_empty +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml b/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml index ecdb0315171e9..4a01df2f869e4 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Aggregation/Sum/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755598 -update_date: Fri, 07 Feb 2020 13:57:27 UTC +update_date: Tue, 19 May 2020 03:05:36 UTC diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/task.html b/learning/katas/python/Common Transforms/Aggregation/Sum/task.md similarity index 90% rename from learning/katas/python/Common Transforms/Aggregation/Sum/task.html rename to learning/katas/python/Common Transforms/Aggregation/Sum/task.md index 50f5b9962cb13..c8e018fa42f22 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Sum/task.html +++ b/learning/katas/python/Common Transforms/Aggregation/Sum/task.md @@ -16,15 +16,13 @@ ~ limitations under the License. --> - -

Aggregation - Sum

-

- Kata: Compute the sum of all elements from an input. -

-
+Aggregation - Sum +----------------- + +**Kata:** Compute the sum of all elements from an input. +
Use CombineGlobally and Python built-in sum.
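A sketch of the hinted combination, with made-up input values:

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create([1, 2, 3, 4, 5])
     | beam.CombineGlobally(sum)  # Python's built-in sum as the combining function
     | beam.Map(print))           # -> 15
```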
- diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/tests.py b/learning/katas/python/Common Transforms/Aggregation/Sum/tests.py index e761cdf5beab7..30964e7995613 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Sum/tests.py +++ b/learning/katas/python/Common Transforms/Aggregation/Sum/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml b/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml index c13409cde4e1a..b128f6e993e23 100644 --- a/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Filter/Filter/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755596 -update_date: Fri, 07 Feb 2020 13:57:21 UTC +update_date: Tue, 19 May 2020 03:05:30 UTC diff --git a/learning/katas/python/Common Transforms/Filter/Filter/task.html b/learning/katas/python/Common Transforms/Filter/Filter/task.md similarity index 72% rename from learning/katas/python/Common Transforms/Filter/Filter/task.html rename to learning/katas/python/Common Transforms/Filter/Filter/task.md index 797f77c32ef90..2092263d6009a 100644 --- a/learning/katas/python/Common Transforms/Filter/Filter/task.html +++ b/learning/katas/python/Common Transforms/Filter/Filter/task.md @@ -16,19 +16,15 @@ ~ limitations under the License. --> - -

Filter

-

- The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. -

-

- Kata: Implement a filter function that filters out the odd numbers by using - - Filter. -

-
+Filter +------ + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +**Kata:** Implement a filter function that filters out the odd numbers by using +[Filter](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Filter). +
Use Filter with a lambda.
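For instance, a sketch that keeps only the even numbers with a lambda predicate (the input values are made up):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(range(1, 11))
     | beam.Filter(lambda num: num % 2 == 0)  # drop the odd numbers
     | beam.Map(print))
```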
- diff --git a/learning/katas/python/Common Transforms/Filter/Filter/tests.py b/learning/katas/python/Common Transforms/Filter/Filter/tests.py index 03487d2def126..da8cd880051fd 100644 --- a/learning/katas/python/Common Transforms/Filter/Filter/tests.py +++ b/learning/katas/python/Common Transforms/Filter/Filter/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_filter(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.Filter' in placeholder: - passed() - else: - failed('Use beam.Filter') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_filter() test_output() diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml b/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml index c897e3ac176bb..227501c0853f8 100644 --- a/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/Filter/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755595 -update_date: Fri, 07 Feb 2020 13:57:19 UTC +update_date: Tue, 19 May 2020 03:05:27 UTC diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/task.html b/learning/katas/python/Common Transforms/Filter/ParDo/task.md similarity index 79% rename from learning/katas/python/Common Transforms/Filter/ParDo/task.html rename to learning/katas/python/Common Transforms/Filter/ParDo/task.md index 1c4ea1be03d5b..2204a8d88b835 100644 --- a/learning/katas/python/Common Transforms/Filter/ParDo/task.html +++ b/learning/katas/python/Common Transforms/Filter/ParDo/task.md @@ -16,16 +16,13 @@ ~ limitations under the License. --> - -

Filter using ParDo

-

- Kata: Implement a filter function that filters out the even numbers by using - - ParDo. -

-
+Filter using ParDo +------------------ + +**Kata:** Implement a filter function that filters out the even numbers by using +[ParDo](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.ParDo). +
Override process method. You can use "yield" for each intended element.
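The same filtering idea expressed as a `DoFn` whose `process` method yields only the elements to keep; a sketch:

```python
import apache_beam as beam

class FilterOutEvenNumbers(beam.DoFn):
    def process(self, element):
        # Yield nothing for even numbers, effectively filtering them out.
        if element % 2 != 0:
            yield element

with beam.Pipeline() as p:
    (p
     | beam.Create(range(1, 11))
     | beam.ParDo(FilterOutEvenNumbers())
     | beam.Map(print))
```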
- diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/tests.py b/learning/katas/python/Common Transforms/Filter/ParDo/tests.py index b1d475bd3bdcd..0a2cbba3efe7b 100644 --- a/learning/katas/python/Common Transforms/Filter/ParDo/tests.py +++ b/learning/katas/python/Common Transforms/Filter/ParDo/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -31,5 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml index ceb1f3daea623..686b9b79ba0a8 100644 --- a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml +++ b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1124221 -update_date: Mon, 09 Mar 2020 14:34:20 UTC +update_date: Tue, 19 May 2020 03:05:49 UTC diff --git a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.html b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.md similarity index 85% rename from learning/katas/python/Common Transforms/WithKeys/WithKeys/task.html rename to learning/katas/python/Common Transforms/WithKeys/WithKeys/task.md index d02b933591678..820e3a4b01a9b 100644 --- a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.html +++ b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.md @@ -16,15 +16,13 @@ ~ limitations under the License. --> - -

WithKeys

-

- Kata: Convert each fruit name into a KV of its first letter and itself, e.g. - apple => ('a', 'apple') -

-
+WithKeys +-------- + +**Kata:** Convert each fruit name into a KV of its first letter and itself, e.g. +`apple => ('a', 'apple')` +
Use WithKeys.
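A sketch using the fruit example from the kata statement; it assumes this Beam version's `WithKeys` accepts a callable key extractor:

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['apple', 'banana', 'cherry'])
     | beam.WithKeys(lambda word: word[0])  # ('a', 'apple'), ('b', 'banana'), ...
     | beam.Map(print))
```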
- diff --git a/learning/katas/python/Common Transforms/WithKeys/WithKeys/tests.py b/learning/katas/python/Common Transforms/WithKeys/WithKeys/tests.py index a3059f27bfa2f..7b37e800894f5 100644 --- a/learning/katas/python/Common Transforms/WithKeys/WithKeys/tests.py +++ b/learning/katas/python/Common Transforms/WithKeys/WithKeys/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_filter(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.WithKeys' in placeholder: - passed() - else: - failed('Use beam.WithKeys') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -44,6 +32,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_filter() test_output() diff --git a/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml b/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml index 28b3f94853f64..bc28ecce41238 100644 --- a/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Branching/Branching/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755592 -update_date: Fri, 07 Feb 2020 13:57:12 UTC +update_date: Tue, 19 May 2020 03:05:20 UTC diff --git a/learning/katas/python/Core Transforms/Branching/Branching/task.html b/learning/katas/python/Core Transforms/Branching/Branching/task.md similarity index 76% rename from learning/katas/python/Core Transforms/Branching/Branching/task.html rename to learning/katas/python/Core Transforms/Branching/Branching/task.md index 12d9645aa03a5..293dec70d4e04 100644 --- a/learning/katas/python/Core Transforms/Branching/Branching/task.html +++ b/learning/katas/python/Core Transforms/Branching/Branching/task.md @@ -16,20 +16,16 @@ ~ limitations under the License. --> - -

Branching

-

- You can use the same PCollection as input for multiple transforms without consuming the input - or altering it. -

-

- Kata: Branch out the numbers to two different transforms: one transform is multiplying - each number by 5 and the other transform is multiplying each number by 10. -

-
+Branching +--------- + +You can use the same PCollection as input for multiple transforms without consuming the input or +altering it. + +**Kata:** Branch out the numbers to two different transforms: one transform is multiplying each +number by 5 and the other transform is multiplying each number by 10.
Refer to the Beam Design Your Pipeline Guide "Multiple transforms process the same PCollection" section for more information.
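A sketch of the branching shape: both transforms consume the same `PCollection`, and neither alters it:

```python
import apache_beam as beam

with beam.Pipeline() as p:
    numbers = p | beam.Create([1, 2, 3, 4, 5])

    # Two independent branches read from the same input.
    times_five = numbers | 'Times 5' >> beam.Map(lambda num: num * 5)
    times_ten = numbers | 'Times 10' >> beam.Map(lambda num: num * 10)

    times_five | 'Print x5' >> beam.Map(print)
    times_ten | 'Print x10' >> beam.Map(print)
```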
- diff --git a/learning/katas/python/Core Transforms/Branching/Branching/tests.py b/learning/katas/python/Core Transforms/Branching/Branching/tests.py index de1fea6ac46a3..6df8cd5fdee77 100644 --- a/learning/katas/python/Core Transforms/Branching/Branching/tests.py +++ b/learning/katas/python/Core Transforms/Branching/Branching/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -41,5 +40,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml index cc40fdd3a88c2..43038c403b788 100644 --- a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755583 -update_date: Fri, 07 Feb 2020 13:56:44 UTC +update_date: Tue, 19 May 2020 03:04:56 UTC diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.html b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.md similarity index 69% rename from learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.html rename to learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.md index 5c7ecf2a81a9a..dc2da5ffabfd2 100644 --- a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.html +++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.md @@ -16,27 +16,25 @@ ~ limitations under the License. --> - -

CoGroupByKey

-

- CoGroupByKey performs a relational join of two or more key/value PCollections that have the same - key type. -

-

- Kata: Implement a - - CoGroupByKey transform that join words by its first alphabetical letter, and then produces - the string representation of the WordsAlphabet model. -

-
+CoGroupByKey
+------------
+
+CoGroupByKey performs a relational join of two or more key/value PCollections that have the same
+key type.
+
+**Kata:** Implement a
+[CoGroupByKey](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.CoGroupByKey)
+transform that joins words by their first alphabetical letter, and then produces the string
+representation of the WordsAlphabet model.
+
Refer to - CoGroupByKeyto solve this problem. + CoGroupByKey to solve this problem.
+
Refer to the Beam Programming Guide "CoGroupByKey" section for more information.
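A sketch of the join; the input words and the final formatting are illustrative, since the kata's `WordsAlphabet` model isn't shown here:

```python
import apache_beam as beam

def by_first_letter(word):
    return (word[0], word)

with beam.Pipeline() as p:
    fruits = (p | 'Fruits' >> beam.Create(['apple', 'banana', 'cherry'])
                | 'Key fruits' >> beam.Map(by_first_letter))
    countries = (p | 'Countries' >> beam.Create(['australia', 'brazil', 'canada'])
                   | 'Key countries' >> beam.Map(by_first_letter))

    ({'fruits': fruits, 'countries': countries}
     | beam.CoGroupByKey()
     # Each element is (letter, {'fruits': [...], 'countries': [...]}).
     | beam.Map(lambda kv: (kv[0], kv[1]['fruits'], kv[1]['countries']))
     | beam.Map(print))
```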
- diff --git a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/tests.py b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/tests.py index da12782656665..16e0501187ef4 100644 --- a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/tests.py +++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -35,5 +34,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml index eafecc41c2316..c7f007a4f2e12 100644 --- a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755587 -update_date: Fri, 07 Feb 2020 13:56:58 UTC +update_date: Tue, 19 May 2020 03:05:05 UTC diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.html b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.md similarity index 67% rename from learning/katas/python/Core Transforms/Combine/Combine PerKey/task.html rename to learning/katas/python/Core Transforms/Combine/Combine PerKey/task.md index 044aae7f9b1a2..c52512b823ab1 100644 --- a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.html +++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.md @@ -16,33 +16,31 @@ ~ limitations under the License. --> - -

Combine - Combine PerKey

-

- After creating a keyed PCollection (for example, by using a GroupByKey transform), a common - pattern is to combine the collection of values associated with each key into a single, merged - value. This pattern of a GroupByKey followed by merging the collection of values is equivalent to - Combine PerKey transform. The combine function you supply to Combine PerKey must be an associative - reduction function or a subclass of CombineFn. -

-

- Kata: Implement the sum of scores per player using - - CombinePerKey. -

-
+Combine - Combine PerKey
+------------------------
+
+After creating a keyed PCollection (for example, by using a GroupByKey transform), a common pattern
+is to combine the collection of values associated with each key into a single, merged value. This
+pattern of a GroupByKey followed by merging the collection of values is equivalent to the Combine
+PerKey transform. The combine function you supply to Combine PerKey must be an associative reduction
+function or a subclass of CombineFn.
+
+**Kata:** Implement the sum of scores per player using
+[CombinePerKey](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombinePerKey).
+
Use CombinePerKey(CombineFn).
+
Extend the CombineFn class to compute the sum of the numbers.
+
Refer to the Beam Programming Guide "Combining values in a keyed PCollection" section for more information.
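Following the hints, a sketch with an illustrative `CombineFn` subclass and made-up scores:

```python
import apache_beam as beam

class SumScoreFn(beam.CombineFn):
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, value):
        return accumulator + value

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return accumulator

with beam.Pipeline() as p:
    (p
     | beam.Create([('Player 1', 15), ('Player 2', 10), ('Player 1', 100)])
     | beam.CombinePerKey(SumScoreFn())
     | beam.Map(print))  # ('Player 1', 115), ('Player 2', 10)
```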
- diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/tests.py b/learning/katas/python/Core Transforms/Combine/Combine PerKey/tests.py index e8042837ed227..cd6ab2e6638f6 100644 --- a/learning/katas/python/Core Transforms/Combine/Combine PerKey/tests.py +++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_combine_placeholders(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.CombinePerKey' in placeholder: - passed() - else: - failed('Use beam.CombinePerKey') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -47,6 +35,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_combine_placeholders() test_output() diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml index 79a29f2a4048b..8330f0553036a 100644 --- a/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Combine/CombineFn/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755585 -update_date: Fri, 07 Feb 2020 13:56:52 UTC +update_date: Tue, 19 May 2020 03:06:40 UTC diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/task.html b/learning/katas/python/Core Transforms/Combine/CombineFn/task.md similarity index 51% rename from learning/katas/python/Core Transforms/Combine/CombineFn/task.html rename to learning/katas/python/Core Transforms/Combine/CombineFn/task.md index 4828e0f6a1e9a..b083774a7558e 100644 --- a/learning/katas/python/Core Transforms/Combine/CombineFn/task.html +++ b/learning/katas/python/Core Transforms/Combine/CombineFn/task.md @@ -16,37 +16,32 @@ ~ limitations under the License. --> - -

Combine - CombineFn

-

- Combine is a Beam transform for combining collections of elements or values in your data. - When you apply a Combine transform, you must provide the function that contains the logic for - combining the elements or values. The combining function should be commutative and associative, - as the function is not necessarily invoked exactly once on all values with a given key. Because - the input data (including the value collection) may be distributed across multiple workers, the - combining function might be called multiple times to perform partial combining on subsets of - the value collection. -

-

- Complex combination operations might require you to create a subclass of CombineFn that has an - accumulation type distinct from the input/output type. You should use CombineFn if the combine - function requires a more sophisticated accumulator, must perform additional pre- or - post-processing, might change the output type, or takes the key into account. -

-

- Kata: Implement the average of numbers using - - Combine.CombineFn. -

-
+Combine - CombineFn +------------------- + +Combine is a Beam transform for combining collections of elements or values in your data. When you +apply a Combine transform, you must provide the function that contains the logic for combining the +elements or values. The combining function should be commutative and associative, as the function +is not necessarily invoked exactly once on all values with a given key. Because the input data +(including the value collection) may be distributed across multiple workers, the combining function + might be called multiple times to perform partial combining on subsets of the value collection. + +Complex combination operations might require you to create a subclass of CombineFn that has an +accumulation type distinct from the input/output type. You should use CombineFn if the combine +function requires a more sophisticated accumulator, must perform additional pre- or post-processing, +might change the output type, or takes the key into account. + +**Kata:** Implement the average of numbers using +[Combine.CombineFn](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineFn). +
Extend the CombineFn class so that it computes the average of the numbers.
+
Refer to the Beam Programming Guide "Advanced combinations using CombineFn" section for more information.
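A minimal sketch of an averaging `CombineFn`, assuming the standard `apache_beam` API (the class name `AverageFn` and the input numbers are illustrative):

```python
import apache_beam as beam

class AverageFn(beam.CombineFn):
    def create_accumulator(self):
        return 0.0, 0  # (running sum, element count)

    def add_input(self, accumulator, element):
        total, count = accumulator
        return total + element, count + 1

    def merge_accumulators(self, accumulators):
        # merge partial results computed on different workers
        totals, counts = zip(*accumulators)
        return sum(totals), sum(counts)

    def extract_output(self, accumulator):
        total, count = accumulator
        return total / count if count else float('NaN')

with beam.Pipeline() as p:
    (p
     | beam.Create([10, 20, 50, 80])
     | beam.CombineGlobally(AverageFn())
     | beam.Map(print))  # 40.0
```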
- diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/tests.py b/learning/katas/python/Core Transforms/Combine/CombineFn/tests.py index 656e5b38abed6..9883983df90f2 100644 --- a/learning/katas/python/Core Transforms/Combine/CombineFn/tests.py +++ b/learning/katas/python/Core Transforms/Combine/CombineFn/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_combine_placeholders(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.CombineGlobally' in placeholder: - passed() - else: - failed('Use beam.CombineGlobally') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_combine_placeholders() test_output() diff --git a/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml b/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml index 9f6681d15dbf2..d61da2975c584 100644 --- a/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Combine/Simple Function/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755584 -update_date: Fri, 07 Feb 2020 13:56:48 UTC +update_date: Tue, 19 May 2020 03:05:00 UTC diff --git a/learning/katas/python/Core Transforms/Combine/Simple Function/task.html b/learning/katas/python/Core Transforms/Combine/Simple Function/task.md similarity index 54% rename from learning/katas/python/Core Transforms/Combine/Simple Function/task.html rename to learning/katas/python/Core Transforms/Combine/Simple Function/task.md index 5e4bd02e2157b..47835894fe1cf 100644 --- a/learning/katas/python/Core Transforms/Combine/Simple Function/task.html +++ b/learning/katas/python/Core Transforms/Combine/Simple Function/task.md @@ -16,32 +16,27 @@ ~ limitations under the License. --> - -

Combine - Simple Function

-

- Combine is a Beam transform for combining collections of elements or values in your data. - When you apply a Combine transform, you must provide the function that contains the logic for - combining the elements or values. The combining function should be commutative and associative, - as the function is not necessarily invoked exactly once on all values with a given key. Because - the input data (including the value collection) may be distributed across multiple workers, the - combining function might be called multiple times to perform partial combining on subsets of - the value collection. -

-

- Simple combine operations, such as sums, can usually be implemented as a simple function. -

-

- Kata: Implement the summation of numbers using - - CombineGlobally. -

-
+Combine - Simple Function +------------------------- + +Combine is a Beam transform for combining collections of elements or values in your data. When you +apply a Combine transform, you must provide the function that contains the logic for combining the +elements or values. The combining function should be commutative and associative, as the function +is not necessarily invoked exactly once on all values with a given key. Because the input data +(including the value collection) may be distributed across multiple workers, the combining function + might be called multiple times to perform partial combining on subsets of the value collection. + +Simple combine operations, such as sums, can usually be implemented as a simple function. + +**Kata:** Implement the summation of numbers using +[CombineGlobally](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.CombineGlobally). +
Implement a simple Python function that performs the summation of the values.
+
Refer to the Beam Programming Guide "Simple combinations using simple functions" section for more information.
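A minimal sketch using the standard `apache_beam` API; Python's built-in `sum` works here because it is commutative and associative (the input numbers are illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create([1, 2, 3, 4, 5])
     | beam.CombineGlobally(sum)  # a plain function, no CombineFn subclass needed
     | beam.Map(print))           # 15
```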
- diff --git a/learning/katas/python/Core Transforms/Combine/Simple Function/tests.py b/learning/katas/python/Core Transforms/Combine/Simple Function/tests.py index 2d740d8400e64..a5465af2d106d 100644 --- a/learning/katas/python/Core Transforms/Combine/Simple Function/tests.py +++ b/learning/katas/python/Core Transforms/Combine/Simple Function/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_combine_placeholders(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.CombineGlobally' in placeholder: - passed() - else: - failed('Use beam.CombineGlobally') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_combine_placeholders() test_output() diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml index 85d30168d3c27..c0a5566689c03 100644 --- a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755593 -update_date: Fri, 07 Feb 2020 13:57:15 UTC +update_date: Tue, 19 May 2020 03:05:23 UTC diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.html b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.md similarity index 51% rename from learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.html rename to learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.md index 94c0e44e2ada2..b2d27ba89983e 100644 --- a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.html +++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.md @@ -16,34 +16,31 @@ ~ limitations under the License. --> - -

Composite Transform

-

- Transforms can have a nested structure, where a complex transform performs multiple simpler - transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). - These transforms are called composite transforms. Nesting multiple transforms inside a single - composite transform can make your code more modular and easier to understand. -

-

- To create your own composite transform, create a subclass of the PTransform class and override - the expand method to specify the actual processing logic. You can then use this transform just as - you would a built-in transform from the Beam SDK. Within your PTransform subclass, you’ll need to - override the expand method. The expand method is where you add the processing logic for the - PTransform. Your override of expand must accept the appropriate type of input PCollection as a - parameter, and specify the output PCollection as the return value. -

-

- Kata: Please implement a composite transform "ExtractAndMultiplyNumbers" that extracts - numbers from comma separated line and then multiplies each number by 10. -

-
+Composite Transform +------------------- + +Transforms can have a nested structure, where a complex transform performs multiple simpler +transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). +These transforms are called composite transforms. Nesting multiple transforms inside a single +composite transform can make your code more modular and easier to understand. + +To create your own composite transform, create a subclass of the PTransform class and override the +expand method to specify the actual processing logic. You can then use this transform just as you +would a built-in transform from the Beam SDK. Within your PTransform subclass, you’ll need to +override the expand method. The expand method is where you add the processing logic for the +PTransform. Your override of expand must accept the appropriate type of input PCollection as a +parameter, and specify the output PCollection as the return value. + +**Kata:** Please implement a composite transform "ExtractAndMultiplyNumbers" that extracts numbers +from a comma-separated line and then multiplies each number by 10. +
Refer to PTransform.
+
Refer to the Beam Programming Guide "Composite transforms" section for more information.
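One possible shape of the requested transform, sketched against the standard `apache_beam` API (the sample input lines are illustrative):

```python
import apache_beam as beam

class ExtractAndMultiplyNumbers(beam.PTransform):
    # expand receives the input PCollection and returns the output PCollection
    def expand(self, pcoll):
        return (pcoll
                | beam.FlatMap(lambda line: line.split(','))
                | beam.Map(lambda num: int(num) * 10))

with beam.Pipeline() as p:
    (p
     | beam.Create(['1,2,3', '4,5,6'])
     | ExtractAndMultiplyNumbers()  # used like any built-in transform
     | beam.Map(print))
```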
- diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/tests.py b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/tests.py index cc7db805461fe..f0fa900f8cbe9 100644 --- a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/tests.py +++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_composite_expand_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'def expand(' in placeholder: - passed() - else: - failed('Override "expand" method') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_composite_expand_method() test_output() diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml b/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml index 634212c1544c0..a64890a7e3633 100644 --- a/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755588 -update_date: Fri, 07 Feb 2020 13:57:02 UTC +update_date: Tue, 19 May 2020 03:05:08 UTC diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task.html b/learning/katas/python/Core Transforms/Flatten/Flatten/task.md similarity index 72% rename from learning/katas/python/Core Transforms/Flatten/Flatten/task.html rename to learning/katas/python/Core Transforms/Flatten/Flatten/task.md index 488c139b37e65..1d52b8609cd32 100644 --- a/learning/katas/python/Core Transforms/Flatten/Flatten/task.html +++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task.md @@ -16,25 +16,23 @@ ~ limitations under the License. --> - -

Flatten

-

- Flatten is a Beam transform for PCollection objects that store the same data type. - Flatten merges multiple PCollection objects into a single logical PCollection. -

-

- Kata: Implement a - - Flatten transform that merges two PCollection of words into a single PCollection. -

-
+Flatten +------- + +Flatten is a Beam transform for PCollection objects that store the same data type. Flatten merges +multiple PCollection objects into a single logical PCollection. + +**Kata:** Implement a +[Flatten](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Flatten) +transform that merges two PCollections of words into a single PCollection. +
Refer to Flatten to solve this problem.
+
Refer to the Beam Programming Guide "Flatten" section for more information.
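A minimal sketch with the standard `apache_beam` API (the two word lists are illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    words_a = p | 'Create A' >> beam.Create(['apple', 'ant'])
    words_b = p | 'Create B' >> beam.Create(['ball', 'bat'])

    # a tuple of PCollections holding the same data type can be flattened
    ((words_a, words_b)
     | beam.Flatten()
     | beam.Map(print))
```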
- diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/tests.py b/learning/katas/python/Core Transforms/Flatten/Flatten/tests.py index c2caa2ee87db1..db32bc65ee974 100644 --- a/learning/katas/python/Core Transforms/Flatten/Flatten/tests.py +++ b/learning/katas/python/Core Transforms/Flatten/Flatten/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_flatten(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.Flatten' in placeholder: - passed() - else: - failed('Use beam.Flatten') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_flatten() test_output() diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml index 23d7d408b2188..6a23287105cf2 100644 --- a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755582 -update_date: Fri, 07 Feb 2020 13:56:41 UTC +update_date: Tue, 19 May 2020 03:04:53 UTC diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.html b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.md similarity index 61% rename from learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.html rename to learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.md index 042912afb48f6..fb32244fff6a2 100644 --- a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.html +++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.md @@ -16,29 +16,27 @@ ~ limitations under the License. --> - -

GroupByKey

-

- GroupByKey is a Beam transform for processing collections of key/value pairs. It’s a parallel - reduction operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style algorithm. The - input to GroupByKey is a collection of key/value pairs that represents a multimap, where the - collection contains multiple pairs that have the same key, but different values. Given such a - collection, you use GroupByKey to collect all of the values associated with each unique key. -

-

- Kata: Implement a - - GroupByKey transform that groups words by its first letter. -

-
+GroupByKey +---------- + +GroupByKey is a Beam transform for processing collections of key/value pairs. It’s a parallel +reduction operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style algorithm. +The input to GroupByKey is a collection of key/value pairs that represents a multimap, where the +collection contains multiple pairs that have the same key, but different values. Given such a +collection, you use GroupByKey to collect all of the values associated with each unique key. + +**Kata:** Implement a +[GroupByKey](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.GroupByKey) +transform that groups words by their first letter. +
Refer to GroupByKey to solve this problem.
+
Refer to the Beam Programming Guide "GroupByKey" section for more information.
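A minimal sketch with the standard `apache_beam` API (the word list is illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['apple', 'ball', 'bear', 'ant'])
     | beam.Map(lambda word: (word[0], word))  # key each word by its first letter
     | beam.GroupByKey()  # one (letter, grouped words) pair per unique key
     | beam.Map(print))
```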
- diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/tests.py b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/tests.py index 8f9ffd57f98d0..e16fb6cb48832 100644 --- a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/tests.py +++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -33,5 +32,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml index 4911596641ba3..f98961e93e7b5 100644 --- a/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Map/FlatMap/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755580 -update_date: Fri, 07 Feb 2020 13:56:38 UTC +update_date: Tue, 19 May 2020 03:04:50 UTC diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/task.html b/learning/katas/python/Core Transforms/Map/FlatMap/task.md similarity index 70% rename from learning/katas/python/Core Transforms/Map/FlatMap/task.html rename to learning/katas/python/Core Transforms/Map/FlatMap/task.md index f69fffd1f3097..7c6aadec400a6 100644 --- a/learning/katas/python/Core Transforms/Map/FlatMap/task.html +++ b/learning/katas/python/Core Transforms/Map/FlatMap/task.md @@ -16,28 +16,24 @@ ~ limitations under the License. --> - -

FlatMapElements

-

- The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. -

-

- FlatMap can be used to simplify DoFn that maps an element to multiple elements (one to many). -

-

- Kata: Implement a function that maps each input sentence into words tokenized by whitespace - (" ") using - - FlatMap. -

-
+FlatMapElements +--------------- + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +FlatMap can be used to simplify a DoFn that maps an element to multiple elements (one to many). + +**Kata:** Implement a function that maps each input sentence into words tokenized by +whitespace (" ") using +[FlatMap](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.FlatMap). +
Use FlatMap with a lambda.
+
Refer to the Beam Programming Guide "Lightweight DoFns and other abstractions" section for more information.
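A minimal sketch with the standard `apache_beam` API (the sentences are illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['Apache Beam', 'Unified batch and streaming'])
     # one input sentence yields many output words (one to many)
     | beam.FlatMap(lambda sentence: sentence.split(' '))
     | beam.Map(print))
```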
- diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/tests.py b/learning/katas/python/Core Transforms/Map/FlatMap/tests.py index 6eaaa643e9388..e166eae902a1b 100644 --- a/learning/katas/python/Core Transforms/Map/FlatMap/tests.py +++ b/learning/katas/python/Core Transforms/Map/FlatMap/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_flatmap(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.FlatMap' in placeholder: - passed() - else: - failed('Use beam.FlatMap') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_flatmap() test_output() diff --git a/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml index a9505b797650c..66446ef58a854 100644 --- a/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Map/Map/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755579 -update_date: Fri, 07 Feb 2020 13:56:35 UTC +update_date: Tue, 19 May 2020 03:04:48 UTC diff --git a/learning/katas/python/Core Transforms/Map/Map/task.html b/learning/katas/python/Core Transforms/Map/Map/task.md similarity index 76% rename from learning/katas/python/Core Transforms/Map/Map/task.html rename to learning/katas/python/Core Transforms/Map/Map/task.md index fee1a4bc1d060..46694d948e22f 100644 --- a/learning/katas/python/Core Transforms/Map/Map/task.html +++ b/learning/katas/python/Core Transforms/Map/Map/task.md @@ -16,24 +16,21 @@ ~ limitations under the License. --> - -

MapElements

-

- The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. -

-

- Kata: Implement a simple map function that multiplies all input elements by 5 using - - Map. -

-
+MapElements +----------- + +The Beam SDKs provide language-specific ways to simplify how you provide your DoFn implementation. + +**Kata:** Implement a simple map function that multiplies all input elements by 5 using +[Map](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Map). +
Use Map with a lambda.
+
Refer to the Beam Programming Guide "Lightweight DoFns and other abstractions" section for more information.
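A minimal sketch with the standard `apache_beam` API (the input numbers are illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create([10, 20, 30])
     | beam.Map(lambda num: num * 5)  # exactly one output element per input
     | beam.Map(print))
```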
- diff --git a/learning/katas/python/Core Transforms/Map/Map/tests.py b/learning/katas/python/Core Transforms/Map/Map/tests.py index 52789eabdde19..2380fe438df70 100644 --- a/learning/katas/python/Core Transforms/Map/Map/tests.py +++ b/learning/katas/python/Core Transforms/Map/Map/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_map(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'beam.Map' in placeholder: - passed() - else: - failed('Use beam.Map') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -42,6 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_map() test_output() diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml index 33a41e96f1db0..e0c2b03a1f851 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755578 -update_date: Fri, 07 Feb 2020 13:56:32 UTC +update_date: Tue, 19 May 2020 03:04:45 UTC diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.html b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.md similarity index 90% rename from learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.html rename to learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.md index 5ff36d0e0e434..3f42193e22d2b 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.html +++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.md @@ -16,28 +16,28 @@ ~ limitations under the License. --> - -

ParDo OneToMany

-

- Kata: Please write a ParDo that maps each input sentence into words tokenized by - whitespace (" "). -

-
+ParDo OneToMany +--------------- + +**Kata:** Please write a ParDo that maps each input sentence into words tokenized by +whitespace (" "). +
Override the process method. You can return an Iterable for multiple elements, or yield each element to return a generator.
+
Use ParDo with DoFn.
+
Refer to the Beam Programming Guide "ParDo" section for more information.
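A sketch of the shape the removed tests checked for (a `BreakIntoWordsDoFn` applied with `beam.ParDo`), assuming the standard `apache_beam` API; the sample sentences are illustrative:

```python
import apache_beam as beam

class BreakIntoWordsDoFn(beam.DoFn):
    def process(self, element):
        # yielding per word returns a generator; returning a list also works
        for word in element.split(' '):
            yield word

with beam.Pipeline() as p:
    (p
     | beam.Create(['Hello Beam', 'It is awesome'])
     | beam.ParDo(BreakIntoWordsDoFn())
     | beam.Map(print))
```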
- diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/tests.py b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/tests.py index b934821e35660..c83a7dd7b0cb2 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/tests.py +++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/tests.py @@ -14,29 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_dofn_process_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'def process(self, element' in placeholder: - passed() - else: - failed('Override "process" method') - - -def test_pardo(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.ParDo(BreakIntoWordsDoFn())' in placeholder: - passed() - else: - failed('Use beam.ParDo') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -52,7 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_dofn_process_method() - test_pardo() test_output() diff --git a/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml b/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml index 8a43bcd9ebc69..97b55d7821e72 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Map/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755577 -update_date: Fri, 07 Feb 2020 13:56:29 UTC +update_date: Tue, 19 May 2020 03:04:42 UTC diff --git a/learning/katas/python/Core Transforms/Map/ParDo/task.html b/learning/katas/python/Core Transforms/Map/ParDo/task.md similarity index 74% rename from learning/katas/python/Core Transforms/Map/ParDo/task.html rename to learning/katas/python/Core Transforms/Map/ParDo/task.md index e6eab7bbd5fba..bbb52a54f52b9 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo/task.html +++ b/learning/katas/python/Core Transforms/Map/ParDo/task.md @@ -16,30 +16,29 @@ ~ limitations under the License. --> - -

ParDo

-

- ParDo is a Beam transform for generic parallel processing. The ParDo processing paradigm is - similar to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers - each element in the input PCollection, performs some processing function (your user code) on - that element, and emits zero, one, or multiple elements to an output PCollection. -

-

- Kata: Please write a simple ParDo that maps the input element by multiplying it by 10. -

-
+ParDo +----- + +ParDo is a Beam transform for generic parallel processing. The ParDo processing paradigm is similar +to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each element +in the input PCollection, performs some processing function (your user code) on that element, and +emits zero, one, or multiple elements to an output PCollection. + +**Kata:** Please write a simple ParDo that maps the input element by multiplying it by 10. +
Override the process method.
+
Use ParDo with DoFn.
+
Refer to the Beam Programming Guide "ParDo" section for more information.
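A sketch matching the shape the removed tests checked for (a `MultiplyByTenDoFn` applied with `beam.ParDo`); the input numbers are illustrative:

```python
import apache_beam as beam

class MultiplyByTenDoFn(beam.DoFn):
    def process(self, element):
        yield element * 10  # emit one output element per input element

with beam.Pipeline() as p:
    (p
     | beam.Create([1, 2, 3])
     | beam.ParDo(MultiplyByTenDoFn())
     | beam.Map(print))
```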
- diff --git a/learning/katas/python/Core Transforms/Map/ParDo/tests.py b/learning/katas/python/Core Transforms/Map/ParDo/tests.py index 55913188a937b..a274e68cd3534 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo/tests.py +++ b/learning/katas/python/Core Transforms/Map/ParDo/tests.py @@ -14,29 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_dofn_process_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'def process(self, element' in placeholder: - passed() - else: - failed('Override "process" method') - - -def test_pardo(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.ParDo(MultiplyByTenDoFn())' in placeholder: - passed() - else: - failed('Use beam.ParDo') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -52,7 +30,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_dofn_process_method() - test_pardo() test_output() diff --git a/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml b/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml index 948872272133b..3a551d6f8fa12 100644 --- a/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Partition/Partition/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755589 -update_date: Fri, 07 Feb 2020 13:59:37 UTC +update_date: Tue, 19 May 2020 03:05:12 UTC diff --git a/learning/katas/python/Core Transforms/Partition/Partition/task.html b/learning/katas/python/Core Transforms/Partition/Partition/task.md similarity index 58% rename from learning/katas/python/Core Transforms/Partition/Partition/task.html rename to learning/katas/python/Core Transforms/Partition/Partition/task.md index 513fd3a82267b..62a244d4752dc 100644 --- a/learning/katas/python/Core Transforms/Partition/Partition/task.html +++ b/learning/katas/python/Core Transforms/Partition/Partition/task.md @@ -16,32 +16,28 @@ ~ limitations under the License. --> - -

Partition

-

- Partition is a Beam transform for PCollection objects that store the same data type. - Partition splits a single PCollection into a fixed number of smaller collections. -

-

- Partition divides the elements of a PCollection according to a partitioning function - that you provide. The partitioning function contains the logic that determines how to split up - the elements of the input PCollection into each resulting partition PCollection. -

-

- Kata: Implement a - - Partition transform that splits a PCollection of numbers into two PCollections. - The first PCollection contains numbers greater than 100, and the second PCollection contains - the remaining numbers. -

-
+Partition +--------- + +Partition is a Beam transform for PCollection objects that store the same data type. Partition +splits a single PCollection into a fixed number of smaller collections. + +Partition divides the elements of a PCollection according to a partitioning function that you +provide. The partitioning function contains the logic that determines how to split up the elements +of the input PCollection into each resulting partition PCollection. + +**Kata:** Implement a +[Partition](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.Partition) +transform that splits a PCollection of numbers into two PCollections. The first PCollection +contains numbers greater than 100, and the second PCollection contains the remaining numbers. +
Refer to Partition to solve this problem.
+
Refer to the Beam Programming Guide "Partition" section for more information.
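A minimal sketch with the standard `apache_beam` API (the function name `partition_fn` and the input numbers are illustrative):

```python
import apache_beam as beam

def partition_fn(number, num_partitions):
    # return the index of the partition this element belongs to
    return 0 if number > 100 else 1

with beam.Pipeline() as p:
    results = (p
               | beam.Create([1, 5, 100, 110, 150, 250])
               | beam.Partition(partition_fn, 2))
    results[0] | 'Log above' >> beam.Map(lambda n: print('> 100:', n))
    results[1] | 'Log rest' >> beam.Map(lambda n: print('<= 100:', n))
```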
- diff --git a/learning/katas/python/Core Transforms/Partition/Partition/tests.py b/learning/katas/python/Core Transforms/Partition/Partition/tests.py index bbeeaf7c196e8..d8285aede76f6 100644 --- a/learning/katas/python/Core Transforms/Partition/Partition/tests.py +++ b/learning/katas/python/Core Transforms/Partition/Partition/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_partition(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.Partition' in placeholder: - passed() - else: - failed('Use beam.Partition') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -51,6 +39,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_partition() test_output() diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml b/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml index 338c410d5074f..4957ecec404c1 100644 --- a/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Side Input/Side Input/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755590 -update_date: Fri, 07 Feb 2020 13:57:06 UTC +update_date: Tue, 19 May 2020 03:05:14 UTC diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/task.html b/learning/katas/python/Core Transforms/Side Input/Side Input/task.md similarity index 64% rename from learning/katas/python/Core Transforms/Side Input/Side Input/task.html rename to learning/katas/python/Core Transforms/Side Input/Side Input/task.md index b9136274c102f..5d67a0beae287 100644 --- a/learning/katas/python/Core Transforms/Side Input/Side Input/task.html +++ b/learning/katas/python/Core Transforms/Side Input/Side Input/task.md @@ -16,38 +16,36 @@ ~ limitations under the License. --> - -

Side Input

-

- In addition to the main input PCollection, you can provide additional inputs to a ParDo transform - in the form of side inputs. A side input is an additional input that your DoFn can access each - time it processes an element in the input PCollection. When you specify a side input, you create - a view of some other data that can be read from within the ParDo transform’s DoFn while - processing each element. -

-

- Side inputs are useful if your ParDo needs to inject additional data when processing each element - in the input PCollection, but the additional data needs to be determined at runtime (and not - hard-coded). Such values might be determined by the input data, or depend on a different branch - of your pipeline. -

-

- Kata: Please enrich each Person with the country based on the city he/she lives in. -

-
+Side Input +---------- + +In addition to the main input PCollection, you can provide additional inputs to a ParDo transform +in the form of side inputs. A side input is an additional input that your DoFn can access each time +it processes an element in the input PCollection. When you specify a side input, you create a view +of some other data that can be read from within the ParDo transform’s DoFn while processing each +element. + +Side inputs are useful if your ParDo needs to inject additional data when processing each element +in the input PCollection, but the additional data needs to be determined at runtime (and not +hard-coded). Such values might be determined by the input data, or depend on a different branch of +your pipeline. + +**Kata:** Please enrich each Person with the country based on the city he/she lives in. +
Override the process method so that it also accepts a side input argument.
+
Use ParDo with a DoFn that accepts a side input.
+
Refer to the Beam Programming Guide "Side inputs" section for more information.
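A sketch of the shape the removed tests checked for (an `EnrichCountryDoFn` receiving a side input); modeling each person as a (name, city) tuple is an assumption for brevity, not the kata's exact Person type:

```python
import apache_beam as beam

class EnrichCountryDoFn(beam.DoFn):
    # the side input arrives as an extra argument to process
    def process(self, element, cities_to_countries):
        person, city = element
        yield person, cities_to_countries.get(city, 'Unknown')

with beam.Pipeline() as p:
    cities = p | 'Cities' >> beam.Create(
        [('Beijing', 'China'), ('London', 'United Kingdom')])
    (p
     | 'People' >> beam.Create([('Henry', 'London'), ('Jane', 'Beijing')])
     | beam.ParDo(EnrichCountryDoFn(), beam.pvalue.AsDict(cities))
     | beam.Map(print))
```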
- diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/tests.py b/learning/katas/python/Core Transforms/Side Input/Side Input/tests.py index 8fdd7da55e207..6171323e0bbb7 100644 --- a/learning/katas/python/Core Transforms/Side Input/Side Input/tests.py +++ b/learning/katas/python/Core Transforms/Side Input/Side Input/tests.py @@ -22,29 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_dofn_process_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'def process(self, element' in placeholder: - passed() - else: - failed('Override "process" method') - - -def test_pardo(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.ParDo(EnrichCountryDoFn(),' in placeholder: - passed() - else: - failed('Use beam.ParDo that accepts side input') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -66,7 +44,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_dofn_process_method() - test_pardo() test_output() diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml b/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml index 74de155ae7cc6..158110e2e711f 100644 --- a/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml +++ b/learning/katas/python/Core Transforms/Side Output/Side Output/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755591 -update_date: Fri, 07 Feb 2020 13:57:09 UTC +update_date: Tue, 19 May 2020 03:05:17 UTC diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/task.html b/learning/katas/python/Core Transforms/Side Output/Side Output/task.md similarity index 77% rename from learning/katas/python/Core Transforms/Side Output/Side Output/task.html rename to learning/katas/python/Core Transforms/Side Output/Side Output/task.md index b6e05431ea10b..5f9ee5d781f8d 100644 --- a/learning/katas/python/Core Transforms/Side Output/Side Output/task.html +++ b/learning/katas/python/Core Transforms/Side Output/Side Output/task.md @@ -16,18 +16,16 @@ ~ limitations under the License. --> - -

Side Output

-

- While ParDo always produces a main output PCollection (as the return value from apply), you can - also have your ParDo produce any number of additional output PCollections. If you choose to have - multiple outputs, your ParDo returns all of the output PCollections (including the main output) - bundled together. -

-

- Kata: Implement additional output to your ParDo for numbers bigger than 100. -

-
+Side Output +----------- + +While ParDo always produces a main output PCollection (as the return value from apply), you can +also have your ParDo produce any number of additional output PCollections. If you choose to have +multiple outputs, your ParDo returns all of the output PCollections (including the main output) +bundled together. + +**Kata:** Implement an additional output to your ParDo for numbers greater than 100. +
Use pvalue.TaggedOutput and ParDo.
+
Refer to the Beam Programming Guide "Additional outputs" section for more information.
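A sketch of the shape the removed tests checked for (a `ProcessNumbersDoFn` using `pvalue.TaggedOutput` with `.with_outputs`); the tag name `above_100` and the input numbers are illustrative:

```python
import apache_beam as beam
from apache_beam import pvalue

class ProcessNumbersDoFn(beam.DoFn):
    def process(self, element):
        if element <= 100:
            yield element  # goes to the main output
        else:
            yield pvalue.TaggedOutput('above_100', element)

with beam.Pipeline() as p:
    results = (p
               | beam.Create([10, 50, 120, 200])
               | beam.ParDo(ProcessNumbersDoFn()).with_outputs(
                   'above_100', main='main'))
    results.main | 'Log main' >> beam.Map(print)
    results.above_100 | 'Log above' >> beam.Map(print)
```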
- diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/tests.py b/learning/katas/python/Core Transforms/Side Output/Side Output/tests.py index 1af84398a91fe..89b299b40744e 100644 --- a/learning/katas/python/Core Transforms/Side Output/Side Output/tests.py +++ b/learning/katas/python/Core Transforms/Side Output/Side Output/tests.py @@ -14,29 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_dofn_process_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'pvalue.TaggedOutput' in placeholder: - passed() - else: - failed('Use pvalue.TaggedOutput') - - -def test_pardo(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if all(['beam.ParDo(ProcessNumbersDoFn())', '.with_outputs,']) in placeholder: - passed() - else: - failed('Use beam.ParDo that outputs multiple outputs') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -61,7 +39,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_dofn_process_method() - test_pardo() test_output() diff --git a/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml b/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml index b1f7f2c9d8a9d..eec4604c1c247 100644 --- a/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml +++ b/learning/katas/python/Examples/Word Count/Word Count/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755604 -update_date: Fri, 07 Feb 2020 13:57:43 UTC +update_date: Tue, 19 May 2020 03:06:04 UTC diff --git a/learning/katas/python/Examples/Word Count/Word Count/task.html b/learning/katas/python/Examples/Word Count/Word Count/task.md similarity index 84% rename from learning/katas/python/Examples/Word Count/Word Count/task.html rename to learning/katas/python/Examples/Word Count/Word Count/task.md index 82ce81cbf8f35..b1bb44c2ced90 100644 --- a/learning/katas/python/Examples/Word Count/Word Count/task.html +++ b/learning/katas/python/Examples/Word Count/Word Count/task.md @@ -16,25 +16,23 @@ ~ limitations under the License. --> - -

Word Count Pipeline

-

- Kata: Create a pipeline that counts the number of words. -

-

- Please output the count of each word in the following format: -

-
+Word Count Pipeline
+-------------------
+
+**Kata:** Create a pipeline that counts the number of words.
+
+Please output the count of each word in the following format:
+```text
   word:count
   ball:5
   book:3
-
-
+``` +
Refer to your katas above.
+
Use MapTuple to unpack a key-value pair into separate function arguments.
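Following the MapTuple hint, a minimal end-to-end sketch with the standard `apache_beam` API (the input lines are illustrative):

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['apple ball book', 'ball book ball'])
     | beam.FlatMap(lambda line: line.split(' '))
     | beam.combiners.Count.PerElement()  # (word, count) pairs
     | beam.MapTuple(lambda word, count: f'{word}:{count}')
     | beam.Map(print))
```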
- diff --git a/learning/katas/python/Examples/Word Count/Word Count/tests.py b/learning/katas/python/Examples/Word Count/Word Count/tests.py index 16b7bf511c0ae..50a2bdcc3e614 100644 --- a/learning/katas/python/Examples/Word Count/Word Count/tests.py +++ b/learning/katas/python/Examples/Word Count/Word Count/tests.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, get_file_output, \ - test_is_not_empty, test_answer_placeholders_text_deleted +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -37,5 +36,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() test_output() diff --git a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml index 3dfb10425346d..a5130d4e01378 100644 --- a/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml +++ b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1076138 -update_date: Fri, 07 Feb 2020 13:56:24 UTC +update_date: Tue, 19 May 2020 03:05:56 UTC diff --git a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.md similarity index 63% rename from learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html rename to learning/katas/python/IO/Built-in IOs/Built-in IOs/task.md index 447dfa3141cff..df4df796ebce2 100644 --- a/learning/katas/java/IO/Built-in IOs/Built-in IOs/task.html +++ b/learning/katas/python/IO/Built-in IOs/Built-in IOs/task.md @@ -16,18 +16,13 @@ ~ limitations under the License. --> - -

Built-in I/Os

-

- Beam SDKs provide many out of the box I/O transforms that can be used to read from many - different sources and write to many different sinks. -

-

- See the Beam-provided I/O - Transforms page for a list of the currently available I/O transforms. -

-

- Note: There is no kata for this task. Please click the "Check" button and - proceed to the next task. -

- \ No newline at end of file +Built-in I/Os +------------- + +Beam SDKs provide many out of the box I/O transforms that can be used to read from many different +sources and write to many different sinks. + +See the [Beam-provided I/O Transforms](https://beam.apache.org/documentation/io/built-in/) page +for a list of the currently available I/O transforms. + +**Note:** There is no kata for this task. Please proceed to the next task. diff --git a/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml b/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml index a6a6ee6702f68..9afea636a85e9 100644 --- a/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml +++ b/learning/katas/python/IO/TextIO/ReadFromText/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755602 -update_date: Fri, 07 Feb 2020 13:57:40 UTC +update_date: Tue, 19 May 2020 03:05:52 UTC diff --git a/learning/katas/python/IO/TextIO/ReadFromText/task.html b/learning/katas/python/IO/TextIO/ReadFromText/task.md similarity index 60% rename from learning/katas/python/IO/TextIO/ReadFromText/task.html rename to learning/katas/python/IO/TextIO/ReadFromText/task.md index c4fc0bde6967a..4f3a7cfc5842f 100644 --- a/learning/katas/python/IO/TextIO/ReadFromText/task.html +++ b/learning/katas/python/IO/TextIO/ReadFromText/task.md @@ -16,30 +16,27 @@ ~ limitations under the License. --> - -

ReadFromText

-

- When you create a pipeline, you often need to read data from some external source, such as a file - or a database. Likewise, you may want your pipeline to output its result data to an external - storage system. Beam provides read and write transforms for a number of common data storage types. - If you want your pipeline to read from or write to a data storage format that isn’t supported by - the built-in transforms, you can implement your own read and write transforms. -

-

- To read a PCollection from one or more text files, use beam.io.ReadFromText to instantiate a - transform and specify the path of the file(s) to be read. -

-

- Kata: Read the 'countries.txt' file and convert each country name into uppercase. -

-
+ReadFromText +------------ + +When you create a pipeline, you often need to read data from some external source, such as a file +or a database. Likewise, you may want your pipeline to output its result data to an external +storage system. Beam provides read and write transforms for a number of common data storage types. +If you want your pipeline to read from or write to a data storage format that isn’t supported by +the built-in transforms, you can implement your own read and write transforms. + +To read a PCollection from one or more text files, use beam.io.ReadFromText to instantiate a +transform and specify the path of the file(s) to be read. + +**Kata:** Read the 'countries.txt' file and convert each country name into uppercase. +
Use beam.io.ReadFromText.
+
Refer to the Beam Programming Guide "Reading input data" section for more information.
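A minimal sketch with the standard `apache_beam` API; 'countries.txt' is the kata's input file:

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.io.ReadFromText('countries.txt')  # one string element per line
     | beam.Map(lambda country: country.upper())
     | beam.Map(print))
```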
- diff --git a/learning/katas/python/IO/TextIO/ReadFromText/tests.py b/learning/katas/python/IO/TextIO/ReadFromText/tests.py index 273aada623974..5a29e43d8797f 100644 --- a/learning/katas/python/IO/TextIO/ReadFromText/tests.py +++ b/learning/katas/python/IO/TextIO/ReadFromText/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_readfromtext_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'ReadFromText(' in placeholder: - passed() - else: - failed('Use beam.io.ReadFromText') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -53,6 +41,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_readfromtext_method() test_output() diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml index 800507e89f534..d4953da14b5a6 100644 --- a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml +++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 755575 -update_date: Fri, 07 Feb 2020 13:56:26 UTC +update_date: Tue, 19 May 2020 03:04:39 UTC diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.html b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.md similarity index 51% rename from learning/katas/python/Introduction/Hello Beam/Hello Beam/task.html rename to learning/katas/python/Introduction/Hello Beam/Hello Beam/task.md index e71982d045f18..b6df12c28d862 100644 --- a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.html +++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.md @@ -16,38 +16,34 @@ ~ limitations under the License. --> - -

Hello Beam Pipeline

-

- Apache Beam is an open source, unified model for defining both batch and streaming data-parallel - processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the - pipeline. The pipeline is then executed by one of Beam’s supported distributed processing - back-ends, which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. -

-

- Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the - problem can be decomposed into many smaller bundles of data that can be processed independently - and in parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data - integration. These tasks are useful for moving data between different storage media and data - sources, transforming data into a more desirable format, or loading data onto a new system. -

-

- To learn more about Apache Beam, refer to - Apache Beam Overview. -

-

- Kata: Your first kata is to create a simple pipeline that takes a hardcoded input element - "Hello Beam". -

-
+Welcome To Apache Beam +---------------------- + +Apache Beam is an open source, unified model for defining both batch and streaming data-parallel +processing pipelines. Using one of the open source Beam SDKs, you build a program that defines the +pipeline. The pipeline is then executed by one of Beam’s supported distributed processing back-ends, +which include Apache Apex, Apache Flink, Apache Spark, and Google Cloud Dataflow. + +Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the problem +can be decomposed into many smaller bundles of data that can be processed independently and in +parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data +integration. These tasks are useful for moving data between different storage media and data +sources, transforming data into a more desirable format, or loading data onto a new system. + +To learn more about Apache Beam, refer to +[Apache Beam Overview](https://beam.apache.org/get-started/beam-overview/). + +**Kata:** Your first kata is to create a simple pipeline that takes a hardcoded input element +"Hello Beam". +
Hardcoded input can be created using Create.
+
Refer to the Beam Programming Guide "Creating a PCollection from in-memory data" section for more information.
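A minimal sketch with the standard `apache_beam` API:

```python
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['Hello Beam'])  # a PCollection built from in-memory data
     | beam.Map(print))
```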
- diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/tests.py b/learning/katas/python/Introduction/Hello Beam/Hello Beam/tests.py index 33d45a642c03a..d0e9098678581 100644 --- a/learning/katas/python/Introduction/Hello Beam/Hello Beam/tests.py +++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/tests.py @@ -14,18 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_answer_placeholders(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - if 'beam.Create' in placeholder: - passed() - else: - failed('Use beam.Create') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -39,6 +28,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_answer_placeholders() test_output() diff --git a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-info.yaml b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-info.yaml index be661cb1758a4..bbdc8d0177958 100644 --- a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-info.yaml +++ b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-info.yaml @@ -22,10 +22,10 @@ files: - name: task.py visible: true placeholders: - - offset: 1211 - length: 163 + - offset: 1231 + length: 155 placeholder_text: TODO() - - offset: 1740 + - offset: 1917 length: 30 placeholder_text: TODO() - name: tests.py diff --git a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml index d65ccb3f218e1..3eafb58a76137 100644 --- a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml +++ b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1124219 -update_date: Mon, 09 Mar 2020 14:33:58 UTC +update_date: Tue, 19 May 2020 03:06:43 UTC diff --git a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.html b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.md similarity index 76% rename from learning/katas/python/Windowing/Adding Timestamp/ParDo/task.html rename to learning/katas/python/Windowing/Adding Timestamp/ParDo/task.md index 2e93a0c9bb215..f664a7b2306bc 100644 --- a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.html +++ b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.md @@ -16,32 +16,30 @@ ~ limitations under the License. --> - -

Adding Timestamp - ParDo

-

- Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need - timestamps, you must add them to your PCollection’s elements. -

-

- You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that - outputs new elements with timestamps that you set. -

-

- Kata: Please assign each element a timestamp based on the the Event.date. -

-
+Adding Timestamp - ParDo +------------------------ + +Bounded sources (such as a file from TextIO) do not provide timestamps for elements. If you need +timestamps, you must add them to your PCollection’s elements. + +You can assign new timestamps to the elements of a PCollection by applying a ParDo transform that +outputs new elements with timestamps that you set. + +**Kata:** Please assign each element a timestamp based on the `Event.timestamp`. +
Use ParDo with DoFn.
+
Use beam.window.TimestampedValue to assign a timestamp to the element.
+
Refer to the Beam Programming Guide "Adding timestamps to a PCollection’s elements" section for more information.
- diff --git a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.py b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.py index f2e6ce6e7460c..aba4f6eedf313 100644 --- a/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.py +++ b/learning/katas/python/Windowing/Adding Timestamp/ParDo/task.py @@ -15,7 +15,7 @@ # limitations under the License. import datetime -import time +import pytz import apache_beam as beam from apache_beam.transforms import window @@ -24,30 +24,30 @@ class Event: - def __init__(self, id, event, date): + def __init__(self, id, event, timestamp): self.id = id self.event = event - self.date = date + self.timestamp = timestamp def __str__(self) -> str: - return f'Event({self.id}, {self.event}, {self.date})' + return f'Event({self.id}, {self.event}, {self.timestamp})' class AddTimestampDoFn(beam.DoFn): def process(self, element, **kwargs): - unix_timestamp = time.mktime(element.date.timetuple()) + unix_timestamp = element.timestamp.timestamp() yield window.TimestampedValue(element, unix_timestamp) p = beam.Pipeline() (p | beam.Create([ - Event('1', 'book-order', datetime.date(2020, 3, 4)), - Event('2', 'pencil-order', datetime.date(2020, 3, 5)), - Event('3', 'paper-order', datetime.date(2020, 3, 6)), - Event('4', 'pencil-order', datetime.date(2020, 3, 7)), - Event('5', 'book-order', datetime.date(2020, 3, 8)), + Event('1', 'book-order', datetime.datetime(2020, 3, 4, 0, 0, 0, 0, tzinfo=pytz.UTC)), + Event('2', 'pencil-order', datetime.datetime(2020, 3, 5, 0, 0, 0, 0, tzinfo=pytz.UTC)), + Event('3', 'paper-order', datetime.datetime(2020, 3, 6, 0, 0, 0, 0, tzinfo=pytz.UTC)), + Event('4', 'pencil-order', datetime.datetime(2020, 3, 7, 0, 0, 0, 0, tzinfo=pytz.UTC)), + Event('5', 'book-order', datetime.datetime(2020, 3, 8, 0, 0, 0, 0, tzinfo=pytz.UTC)), ]) | beam.ParDo(AddTimestampDoFn()) | LogElements(with_timestamp=True)) diff --git a/learning/katas/python/Windowing/Adding Timestamp/ParDo/tests.py b/learning/katas/python/Windowing/Adding Timestamp/ParDo/tests.py index 1db5ba142f2b7..2a2011f707c4a 100644 --- a/learning/katas/python/Windowing/Adding Timestamp/ParDo/tests.py +++ b/learning/katas/python/Windowing/Adding Timestamp/ParDo/tests.py @@ -14,40 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_dofn_process_method(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'def process(self,' in placeholder: - passed() - else: - failed('Override "process" method') - - -def test_pardo(): - placeholders = get_answer_placeholders() - placeholder = placeholders[1] - - if 'beam.ParDo(AddTimestampDoFn())' in placeholder: - passed() - else: - failed('Use beam.ParDo') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): output = get_file_output() answers = [ - "Event(1, book-order, 2020-03-04), timestamp='2020-03-03T16:00:00Z'", - "Event(2, pencil-order, 2020-03-05), timestamp='2020-03-04T16:00:00Z'", - "Event(3, paper-order, 2020-03-06), timestamp='2020-03-05T16:00:00Z'", - "Event(4, pencil-order, 2020-03-07), timestamp='2020-03-06T16:00:00Z'", - "Event(5, book-order, 2020-03-08), timestamp='2020-03-07T16:00:00Z'" + "Event(1, book-order, 2020-03-04 00:00:00+00:00), timestamp='2020-03-04T00:00:00Z'", + "Event(2, pencil-order, 2020-03-05 00:00:00+00:00), timestamp='2020-03-05T00:00:00Z'", + "Event(3, paper-order, 2020-03-06 00:00:00+00:00), timestamp='2020-03-06T00:00:00Z'", + "Event(4, pencil-order, 2020-03-07 00:00:00+00:00), timestamp='2020-03-07T00:00:00Z'", + "Event(5, book-order, 2020-03-08 00:00:00+00:00), timestamp='2020-03-08T00:00:00Z'" ] if all(line in output for line in answers): @@ -58,7 +36,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_dofn_process_method() - test_pardo() test_output() diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml index 289777f8f57c4..5c91b23e68436 100644 --- a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml +++ b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-info.yaml @@ -22,7 +22,7 @@ files: - name: task.py visible: true placeholders: - - offset: 2074 + - offset: 2067 length: 85 placeholder_text: TODO() - name: tests.py diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml index c826ee1d0de75..70623269a5d5b 100644 --- a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml +++ b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task-remote-info.yaml @@ -1,2 +1,2 @@ id: 1124220 -update_date: Mon, 09 Mar 2020 14:34:10 UTC +update_date: Tue, 19 May 2020 03:06:01 UTC diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.html b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.html deleted file mode 100644 index 2709b677aef2b..0000000000000 --- a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.html +++ /dev/null @@ -1,61 +0,0 @@ - - - -

-Fixed Time Window
-
-Windowing subdivides a PCollection according to the timestamps of its individual elements.
-Transforms that aggregate multiple elements, such as GroupByKey and Combine, work implicitly on
-a per-window basis — they process each PCollection as a succession of multiple, finite windows,
-though the entire collection itself may be of unbounded size.
-
-In the Beam model, any PCollection (including unbounded PCollections) can be subdivided into
-logical windows. Each element in a PCollection is assigned to one or more windows according to
-the PCollection’s windowing function, and each individual window contains a finite number of
-elements. Grouping transforms then consider each PCollection’s elements on a per-window basis.
-GroupByKey, for example, implicitly groups the elements of a PCollection by key and window.
-
-Beam provides several windowing functions, including:
-
-The simplest form of windowing is using fixed time windows. A fixed time window represents a
-consistent duration, non overlapping time interval in the data stream.
-
-Kata: Please count the number of events that happened based on fixed window with
-1-day duration.
-
-Use FixedWindows with 1-day duration.
-
-Refer to the Beam Programming Guide "Fixed time windows" section for more information.
diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.md b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.md
new file mode 100644
index 0000000000000..cbff325153905
--- /dev/null
+++ b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.md
@@ -0,0 +1,54 @@
+
+
+Fixed Time Window
+-----------------
+
+Windowing subdivides a PCollection according to the timestamps of its individual elements.
+Transforms that aggregate multiple elements, such as GroupByKey and Combine, work implicitly on a
+per-window basis — they process each PCollection as a succession of multiple, finite windows,
+though the entire collection itself may be of unbounded size.
+
+In the Beam model, any PCollection (including unbounded PCollections) can be subdivided into
+logical windows. Each element in a PCollection is assigned to one or more windows according to the
+PCollection’s windowing function, and each individual window contains a finite number of elements.
+Grouping transforms then consider each PCollection’s elements on a per-window basis. GroupByKey,
+for example, implicitly groups the elements of a PCollection by key and window.
+
+Beam provides several windowing functions, including:
+
+* Fixed Time Windows
+* Sliding Time Windows
+* Per-Session Windows
+* Single Global Window
+
+The simplest form of windowing is using fixed time windows. A fixed time window represents a
+consistent duration, non-overlapping time interval in the data stream.
+
+**Kata:** Please count the number of events that happened based on a fixed window with 1-day duration.
+
+Use FixedWindows with 1-day duration.
+
+Refer to the Beam Programming Guide "Fixed time windows" section for more information.
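The kata reduces to two moving parts: beam.WindowInto with a FixedWindows window function sized in seconds, followed by a per-window count. Before the task.py hunk below, a minimal runnable sketch; the hard-coded epoch seconds are illustrative stand-ins for the kata's datetime/pytz setup:

import apache_beam as beam
from apache_beam.transforms import window

ONE_DAY_IN_SECONDS = 24 * 60 * 60

with beam.Pipeline() as p:
    (p
     | beam.Create([
         # Unix timestamps in seconds (UTC); values are illustrative.
         window.TimestampedValue('event', 1583020800),  # 2020-03-01
         window.TimestampedValue('event', 1583020800),  # 2020-03-01
         window.TimestampedValue('event', 1583625600),  # 2020-03-08
     ])
     | beam.WindowInto(window.FixedWindows(ONE_DAY_IN_SECONDS))
     | beam.combiners.Count.PerElement()
     | beam.Map(print))

Because Count.PerElement runs per window, the two 2020-03-01 events are counted together while the 2020-03-08 event lands in its own one-day window.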
diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.py b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.py index 384eb1a97fdd3..0444becb41fe8 100644 --- a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.py +++ b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/task.py @@ -15,6 +15,7 @@ # limitations under the License. from datetime import datetime +import pytz import apache_beam as beam from apache_beam.transforms import window @@ -25,16 +26,16 @@ p = beam.Pipeline() (p | beam.Create([ - window.TimestampedValue("event", datetime.fromisoformat('2020-03-01T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-01T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-01T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-01T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-05T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-05T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-08T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-08T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-08T00:00:00+00:00').timestamp()), - window.TimestampedValue("event", datetime.fromisoformat('2020-03-10T00:00:00+00:00').timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 5, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 5, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 8, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 8, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 8, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), + window.TimestampedValue("event", datetime(2020, 3, 10, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()), ]) | beam.WindowInto(window.FixedWindows(24*60*60)) | beam.combiners.Count.PerElement() diff --git a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/tests.py b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/tests.py index ee5b2ab3e1f28..e627fb5f4ec44 100644 --- a/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/tests.py +++ b/learning/katas/python/Windowing/Fixed Time Window/Fixed Time Window/tests.py @@ -14,19 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from test_helper import failed, passed, \ - get_answer_placeholders, get_file_output, test_is_not_empty, \ - test_answer_placeholders_text_deleted - - -def test_fixedwindows(): - placeholders = get_answer_placeholders() - placeholder = placeholders[0] - - if 'FixedWindows' in placeholder: - passed() - else: - failed('Use FixedWindows') +from test_helper import failed, passed, get_file_output, test_is_not_empty def test_output(): @@ -47,6 +35,4 @@ def test_output(): if __name__ == '__main__': test_is_not_empty() - test_answer_placeholders_text_deleted() - test_fixedwindows() test_output() diff --git a/learning/katas/python/course-remote-info.yaml b/learning/katas/python/course-remote-info.yaml index 38b403da0f4b4..aa03268344306 100644 --- a/learning/katas/python/course-remote-info.yaml +++ b/learning/katas/python/course-remote-info.yaml @@ -1,2 +1,2 @@ id: 54532 -update_date: Mon, 09 Mar 2020 14:33:44 UTC +update_date: Tue, 19 May 2020 03:04:36 UTC diff --git a/learning/katas/python/requirements.txt b/learning/katas/python/requirements.txt index 94d77ad88f2f0..d3b069b0b463d 100644 --- a/learning/katas/python/requirements.txt +++ b/learning/katas/python/requirements.txt @@ -16,3 +16,5 @@ apache-beam==2.19.0 apache-beam[test]==2.19.0 + +pytz~=2019.3 \ No newline at end of file diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java index 9ed686e0ab499..22bff18dbdc21 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java @@ -126,6 +126,15 @@ public interface FlinkPipelineOptions void setFailOnCheckpointingErrors(Boolean failOnCheckpointingErrors); + @Description( + "If set, finishes the current bundle and flushes all output before checkpointing the state of the operators. " + + "By default, starts checkpointing immediately and buffers any remaining bundle output as part of the checkpoint. " + + "The setting may affect the checkpoint alignment.") + @Default.Boolean(false) + boolean getFinishBundleBeforeCheckpointing(); + + void setFinishBundleBeforeCheckpointing(boolean finishBundleBeforeCheckpointing); + @Description( "Shuts down sources which have been idle for the configured time of milliseconds. Once a source has been " + "shut down, checkpointing is not possible anymore. Shutting down the sources eventually leads to pipeline " @@ -175,7 +184,7 @@ public interface FlinkPipelineOptions @Default.Boolean(false) Boolean getDisableMetrics(); - void setDisableMetrics(Boolean enableMetrics); + void setDisableMetrics(Boolean disableMetrics); /** Enables or disables externalized checkpoints. 
*/ @Description( diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java index 51cd98d359128..f800bc20e5125 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java @@ -189,6 +189,8 @@ public class DoFnOperator extends AbstractStreamOperator timerService; private transient PushedBackElementsHandler> pushedBackElementsHandler; @@ -294,6 +296,8 @@ public DoFnOperator( flinkOptions.getCheckpointingInterval() + Math.max(0, flinkOptions.getMinPauseBetweenCheckpoints())); } + + this.finishBundleBeforeCheckpointing = flinkOptions.getFinishBundleBeforeCheckpointing(); } // allow overriding this in WindowDoFnOperator because this one dynamically creates @@ -843,6 +847,17 @@ protected final void invokeFinishBundle() { } } + @Override + public void prepareSnapshotPreBarrier(long checkpointId) { + if (finishBundleBeforeCheckpointing) { + // We finish the bundle and flush any pending data. + // This avoids buffering any data as part of snapshotState() below. + while (bundleStarted) { + invokeFinishBundle(); + } + } + } + @Override public final void snapshotState(StateSnapshotContext context) throws Exception { if (checkpointStats != null) { @@ -855,8 +870,6 @@ public final void snapshotState(StateSnapshotContext context) throws Exception { bufferingDoFnRunner.checkpoint(context.getCheckpointId()); } - // We can't output here anymore because the checkpoint barrier has already been - // sent downstream. This is going to change with 1.6/1.7's prepareSnapshotBarrier. try { outputManager.openBuffer(); // Ensure that no new bundle gets started as part of finishing a bundle @@ -946,8 +959,15 @@ BufferedOutputManager create( /** * A {@link DoFnRunners.OutputManager} that can buffer its outputs. Uses {@link * PushedBackElementsHandler} to buffer the data. Buffering data is necessary because no elements - * can be emitted during {@code snapshotState}. This can be removed once we upgrade Flink to >= - * 1.6 which allows us to finish the bundle before the checkpoint barriers have been emitted. + * can be emitted during {@code snapshotState} which is called when the checkpoint barrier already + * has been sent downstream. Emitting elements would break the flow of checkpoint barrier and + * violate exactly-once semantics. + * + *

This buffering can be deactivated using {@code
+ * FlinkPipelineOptions#setFinishBundleBeforeCheckpointing(true)}. If activated, we flush out
+ * bundle data before the barrier is sent downstream. This is done via {@code
+ * prepareSnapshotPreBarrier}. When Flink supports unaligned checkpoints, this should become the
+ * default and this class should be removed as in https://github.com/apache/beam/pull/9652.
  */
 public static class BufferedOutputManager<OutputT> implements DoFnRunners.OutputManager {
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java
index 897979abc464e..13c04aa47ed68 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java
@@ -83,6 +83,7 @@ public void testDefaults() {
     assertThat(options.getCheckpointTimeoutMillis(), is(-1L));
     assertThat(options.getNumConcurrentCheckpoints(), is(1));
     assertThat(options.getFailOnCheckpointingErrors(), is(true));
+    assertThat(options.getFinishBundleBeforeCheckpointing(), is(false));
     assertThat(options.getNumberOfExecutionRetries(), is(-1));
     assertThat(options.getExecutionRetryDelay(), is(-1L));
     assertThat(options.getRetainExternalizedCheckpointsOnCancellation(), is(false));
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/website/PipelineOptionsTableGenerator.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/website/PipelineOptionsTableGenerator.java
index 8ca490e816518..7ef13601edea5 100644
--- a/runners/flink/src/test/java/org/apache/beam/runners/flink/website/PipelineOptionsTableGenerator.java
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/website/PipelineOptionsTableGenerator.java
@@ -84,8 +84,8 @@ private static void printHeader() { System.out.println( ""); }
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ClassLoaderFileSystem.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ClassLoaderFileSystem.java
new file mode 100644
index 0000000000000..8437ccfdf5fa1
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ClassLoaderFileSystem.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.beam.sdk.io; + +import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.service.AutoService; +import java.io.IOException; +import java.io.InputStream; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import java.nio.channels.WritableByteChannel; +import java.util.Collection; +import java.util.List; +import javax.annotation.Nullable; +import org.apache.beam.sdk.annotations.Experimental; +import org.apache.beam.sdk.io.fs.CreateOptions; +import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.io.fs.ResolveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList; + +/** A read-only {@link FileSystem} implementation looking up resources using a ClassLoader. */ +public class ClassLoaderFileSystem extends FileSystem { + + public static final String SCHEMA = "classpath"; + private static final String PREFIX = SCHEMA + "://"; + + ClassLoaderFileSystem() {} + + @Override + protected List match(List specs) throws IOException { + throw new UnsupportedOperationException("Un-globbable filesystem."); + } + + @Override + protected WritableByteChannel create( + ClassLoaderResourceId resourceId, CreateOptions createOptions) throws IOException { + throw new UnsupportedOperationException("Read-only filesystem."); + } + + @Override + protected ReadableByteChannel open(ClassLoaderResourceId resourceId) throws IOException { + ClassLoader classLoader = getClass().getClassLoader(); + InputStream inputStream = + classLoader.getResourceAsStream(resourceId.path.substring(PREFIX.length())); + if (inputStream == null) { + + throw new IOException( + "Unable to load " + + resourceId.path + + " with " + + classLoader + + " URL " + + classLoader.getResource(resourceId.path.substring(PREFIX.length()))); + } + return Channels.newChannel(inputStream); + } + + @Override + protected void copy( + List srcResourceIds, List destResourceIds) + throws IOException { + throw new UnsupportedOperationException("Read-only filesystem."); + } + + @Override + protected void rename( + List srcResourceIds, List destResourceIds) + throws IOException { + throw new UnsupportedOperationException("Read-only filesystem."); + } + + @Override + protected void delete(Collection resourceIds) throws IOException { + throw new UnsupportedOperationException("Read-only filesystem."); + } + + @Override + protected ClassLoaderResourceId matchNewResource(String path, boolean isDirectory) { + return new ClassLoaderResourceId(path); + } + + @Override + protected String getScheme() { + return SCHEMA; + } + + public static class ClassLoaderResourceId implements ResourceId { + + private final String path; + + private ClassLoaderResourceId(String path) { + checkArgument(path.startsWith(PREFIX), path); + this.path = path; + } + + @Override + public ClassLoaderResourceId resolve(String other, ResolveOptions resolveOptions) { + if (other.startsWith(PREFIX)) { + return new ClassLoaderResourceId(other); + } else if (other.startsWith("/")) { + return new ClassLoaderResourceId(SCHEMA + ":/" + other); + } else { + return new ClassLoaderResourceId(path + "/" + other); + } + } + + @Override + public ClassLoaderResourceId getCurrentDirectory() { + int ix = path.lastIndexOf('/'); + if (ix <= PREFIX.length()) { + return new ClassLoaderResourceId(PREFIX); + } else { + return new 
ClassLoaderResourceId(path.substring(0, ix)); + } + } + + @Override + public String getScheme() { + return SCHEMA; + } + + @Nullable + @Override + public String getFilename() { + return path; + } + + @Override + public boolean isDirectory() { + return false; + } + } + + /** {@link AutoService} registrar for the {@link ClassLoaderFileSystem}. */ + @AutoService(FileSystemRegistrar.class) + @Experimental(Experimental.Kind.FILESYSTEM) + public static class ClassLoaderFileSystemRegistrar implements FileSystemRegistrar { + @Override + public Iterable fromOptions(@Nullable PipelineOptions options) { + return ImmutableList.of(new ClassLoaderFileSystem()); + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/Date.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/Date.java new file mode 100644 index 0000000000000..d942340db5d27 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/Date.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.logicaltypes; + +import java.time.LocalDate; +import org.apache.beam.sdk.schemas.Schema; + +/** + * A date without a time-zone. + * + *

It cannot represent an instant on the time-line without additional information such as an + * offset or time-zone. + * + *

Its input type is a {@link LocalDate}, and base type is a {@link Long} that represents an
+ * incrementing count of days where day 0 is 1970-01-01 (ISO).
+ */
+public class Date implements Schema.LogicalType<LocalDate, Long> {
+
+  @Override
+  public String getIdentifier() {
+    return "beam:logical_type:date:v1";
+  }
+
+  // unused
+  @Override
+  public Schema.FieldType getArgumentType() {
+    return Schema.FieldType.STRING;
+  }
+
+  // unused
+  @Override
+  public String getArgument() {
+    return "";
+  }
+
+  @Override
+  public Schema.FieldType getBaseType() {
+    return Schema.FieldType.INT64;
+  }
+
+  @Override
+  public Long toBaseType(LocalDate input) {
+    return input == null ? null : input.toEpochDay();
+  }
+
+  @Override
+  public LocalDate toInputType(Long base) {
+    return base == null ? null : LocalDate.ofEpochDay(base);
+  }
+}
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/SqlTypes.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/SqlTypes.java
new file mode 100644
index 0000000000000..4dbab48251596
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/SqlTypes.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.schemas.logicaltypes;
+
+import java.time.LocalDate;
+import org.apache.beam.sdk.schemas.Schema.LogicalType;
+
+/** Beam {@link org.apache.beam.sdk.schemas.Schema.LogicalType}s corresponding to SQL data types. */
+public class SqlTypes {
+
+  private SqlTypes() {}
+
+  /** Beam LogicalType corresponding to ZetaSQL/CalciteSQL DATE type. */
+  public static final LogicalType<LocalDate, Long> DATE = new Date();
+}
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ClassLoaderFileSystemTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ClassLoaderFileSystemTest.java
new file mode 100644
index 0000000000000..2667196fb3383
--- /dev/null
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ClassLoaderFileSystemTest.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io; + +import static java.nio.channels.Channels.newInputStream; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.channels.ReadableByteChannel; +import org.apache.beam.sdk.io.fs.ResolveOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ClassLoaderFileSystemTest { + + private static final String SOME_CLASS = + "classpath://org/apache/beam/sdk/io/ClassLoaderFileSystem.class"; + + @Test + public void testOpen() throws IOException { + ClassLoaderFileSystem filesystem = new ClassLoaderFileSystem(); + ReadableByteChannel channel = filesystem.open(filesystem.matchNewResource(SOME_CLASS, false)); + checkIsClass(channel); + } + + @Test + public void testRegistrar() throws IOException { + ReadableByteChannel channel = FileSystems.open(FileSystems.matchNewResource(SOME_CLASS, false)); + checkIsClass(channel); + } + + @Test + public void testResolve() throws IOException { + ClassLoaderFileSystem filesystem = new ClassLoaderFileSystem(); + ClassLoaderFileSystem.ClassLoaderResourceId original = + filesystem.matchNewResource(SOME_CLASS, false); + ClassLoaderFileSystem.ClassLoaderResourceId parent = original.getCurrentDirectory(); + ClassLoaderFileSystem.ClassLoaderResourceId grandparent = parent.getCurrentDirectory(); + assertEquals("classpath://org/apache/beam/sdk", grandparent.getFilename()); + ClassLoaderFileSystem.ClassLoaderResourceId resource = + grandparent + .resolve("io", ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY) + .resolve( + "ClassLoaderFileSystem.class", ResolveOptions.StandardResolveOptions.RESOLVE_FILE); + ReadableByteChannel channel = filesystem.open(resource); + checkIsClass(channel); + } + + public void checkIsClass(ReadableByteChannel channel) throws IOException { + FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create()); + InputStream inputStream = newInputStream(channel); + byte[] magic = new byte[4]; + inputStream.read(magic); + assertArrayEquals(magic, new byte[] {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE}); + } +} diff --git a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/SchemaUtils.java b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/SchemaUtils.java index 8e76f4cc7bb1e..0ce9f335bc895 100644 --- a/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/SchemaUtils.java +++ b/sdks/java/extensions/sql/datacatalog/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/datacatalog/SchemaUtils.java @@ -28,6 +28,7 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.base.Strings; import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableMap; @@ -38,7 +39,7 @@ class SchemaUtils { ImmutableMap.builder() .put("BOOL", FieldType.BOOLEAN) .put("BYTES", FieldType.BYTES) - .put("DATE", FieldType.logicalType(new 
CalciteUtils.DateType())) + .put("DATE", FieldType.logicalType(SqlTypes.DATE)) .put("DATETIME", FieldType.DATETIME) .put("DOUBLE", FieldType.DOUBLE) .put("FLOAT", FieldType.DOUBLE) diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java index b9821aa1f237c..25392ff7cc5ee 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java @@ -20,13 +20,13 @@ import static org.apache.beam.sdk.schemas.Schema.FieldType; import static org.apache.beam.sdk.schemas.Schema.TypeName; import static org.apache.beam.vendor.calcite.v1_20_0.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.DateTimeUtils.MILLIS_PER_DAY; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.lang.reflect.Modifier; import java.lang.reflect.Type; import java.math.BigDecimal; +import java.time.LocalDate; import java.util.AbstractList; import java.util.AbstractMap; import java.util.Arrays; @@ -39,11 +39,11 @@ import org.apache.beam.sdk.extensions.sql.impl.planner.BeamJavaTypeFactory; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.CharType; -import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.DateType; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeType; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeWithLocalTzType; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimestampWithLocalTzType; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; @@ -315,7 +315,7 @@ private static Expression castOutput(Expression value, FieldType toType) { private static Expression castOutputTime(Expression value, FieldType toType) { Expression valueDateTime = value; - // First, convert to millis + // First, convert to millis (except for DATE type) if (CalciteUtils.TIMESTAMP.typesEqual(toType) || CalciteUtils.NULLABLE_TIMESTAMP.typesEqual(toType)) { if (value.getType() == java.sql.Timestamp.class) { @@ -331,13 +331,16 @@ private static Expression castOutputTime(Expression value, FieldType toType) { if (value.getType() == java.sql.Date.class) { valueDateTime = Expressions.call(BuiltInMethod.DATE_TO_INT.method, valueDateTime); } - valueDateTime = Expressions.multiply(valueDateTime, Expressions.constant(MILLIS_PER_DAY)); } else { throw new UnsupportedOperationException("Unknown DateTime type " + toType); } - // Second, convert to joda Instant - valueDateTime = Expressions.new_(Instant.class, valueDateTime); + // Second, convert to joda Instant (or LocalDate for DATE type) + if (CalciteUtils.DATE.typesEqual(toType) || CalciteUtils.NULLABLE_DATE.typesEqual(toType)) { + valueDateTime = Expressions.call(LocalDate.class, "ofEpochDay", valueDateTime); + } else { + valueDateTime = Expressions.new_(Instant.class, valueDateTime); + } // Third, make conversion conditional on non-null input. 
if (!((Class) value.getType()).isPrimitive()) { @@ -371,9 +374,9 @@ private static class InputGetterImpl implements RexToLixTranslator.InputGetter { .put(TypeName.ROW, Row.class) .build(); - private static final Map LOGICAL_TYPE_CONVERSION_MAP = + private static final Map LOGICAL_TYPE_TO_BASE_TYPE_MAP = ImmutableMap.builder() - .put(DateType.IDENTIFIER, ReadableInstant.class) + .put(SqlTypes.DATE.getIdentifier(), Long.class) .put(TimeType.IDENTIFIER, ReadableInstant.class) .put(TimeWithLocalTzType.IDENTIFIER, ReadableInstant.class) .put(TimestampWithLocalTzType.IDENTIFIER, ReadableInstant.class) @@ -406,7 +409,7 @@ private static Expression value( if (storageType == Object.class) { convertTo = Object.class; } else if (fromType.getTypeName().isLogicalType()) { - convertTo = LOGICAL_TYPE_CONVERSION_MAP.get(fromType.getLogicalType().getIdentifier()); + convertTo = LOGICAL_TYPE_TO_BASE_TYPE_MAP.get(fromType.getLogicalType().getIdentifier()); } else { convertTo = TYPE_CONVERSION_MAP.get(fromType.getTypeName()); } @@ -427,18 +430,13 @@ private static Expression value( private static Expression value(Expression value, Schema.FieldType type) { if (type.getTypeName().isLogicalType()) { - Expression millisField = Expressions.call(value, "getMillis"); String logicalId = type.getLogicalType().getIdentifier(); - if (logicalId.equals(TimeType.IDENTIFIER)) { - return nullOr(value, Expressions.convert_(millisField, int.class)); - } else if (logicalId.equals(DateType.IDENTIFIER)) { - value = - nullOr( - value, - Expressions.convert_( - Expressions.divide(millisField, Expressions.constant(MILLIS_PER_DAY)), - int.class)); - } else if (!logicalId.equals(CharType.IDENTIFIER)) { + if (TimeType.IDENTIFIER.equals(logicalId)) { + return nullOr( + value, Expressions.convert_(Expressions.call(value, "getMillis"), int.class)); + } else if (SqlTypes.DATE.getIdentifier().equals(logicalId)) { + value = nullOr(value, value); + } else if (!CharType.IDENTIFIER.equals(logicalId)) { throw new UnsupportedOperationException( "Unknown LogicalType " + type.getLogicalType().getIdentifier()); } diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamEnumerableConverter.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamEnumerableConverter.java index 11820a0df9439..1a688df933307 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamEnumerableConverter.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamEnumerableConverter.java @@ -18,9 +18,9 @@ package org.apache.beam.sdk.extensions.sql.impl.rel; import static org.apache.beam.vendor.calcite.v1_20_0.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.DateTimeUtils.MILLIS_PER_DAY; import java.io.IOException; +import java.time.LocalDate; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -36,7 +36,6 @@ import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.PipelineResult.State; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.CharType; -import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.DateType; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeType; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.metrics.Counter; @@ -50,6 +49,7 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import 
org.apache.beam.sdk.runners.TransformHierarchy.Node; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; @@ -303,11 +303,15 @@ private static Object fieldToAvatica(Schema.FieldType type, Object beamValue) { switch (type.getTypeName()) { case LOGICAL_TYPE: String logicalId = type.getLogicalType().getIdentifier(); - if (logicalId.equals(TimeType.IDENTIFIER)) { + if (TimeType.IDENTIFIER.equals(logicalId)) { return (int) ((ReadableInstant) beamValue).getMillis(); - } else if (logicalId.equals(DateType.IDENTIFIER)) { - return (int) (((ReadableInstant) beamValue).getMillis() / MILLIS_PER_DAY); - } else if (logicalId.equals(CharType.IDENTIFIER)) { + } else if (SqlTypes.DATE.getIdentifier().equals(logicalId)) { + if (beamValue instanceof Long) { // base type + return ((Long) beamValue).intValue(); + } else { // input type + return (int) (((LocalDate) beamValue).toEpochDay()); + } + } else if (CharType.IDENTIFIER.equals(logicalId)) { return beamValue; } else { throw new UnsupportedOperationException("Unknown DateTime type " + logicalId); diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/schema/BeamTableUtils.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/schema/BeamTableUtils.java index f3c2704cdbc4a..aa8767464ef78 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/schema/BeamTableUtils.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/schema/BeamTableUtils.java @@ -22,7 +22,10 @@ import java.io.IOException; import java.io.StringWriter; import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; import java.util.ArrayList; +import java.util.GregorianCalendar; import java.util.List; import java.util.stream.IntStream; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils; @@ -114,8 +117,17 @@ public static Object autoCastField(Schema.Field field, Object rawObj) { } else { return rawObj; } + } else if (CalciteUtils.DATE.typesEqual(type) || CalciteUtils.NULLABLE_DATE.typesEqual(type)) { + if (rawObj instanceof GregorianCalendar) { // used by the SQL CLI + GregorianCalendar calendar = (GregorianCalendar) rawObj; + return Instant.ofEpochMilli(calendar.getTimeInMillis()) + .atZone(calendar.getTimeZone().toZoneId()) + .toLocalDate(); + } else { + return LocalDate.ofEpochDay((Integer) rawObj); + } } else if (CalciteUtils.isDateTimeType(type)) { - // Internal representation of DateType in Calcite is convertible to Joda's Datetime. + // Internal representation of Date in Calcite is convertible to Joda's Datetime. 
return new DateTime(rawObj); } else if (type.getTypeName().isNumericType() && ((rawObj instanceof String) diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java index e0b994d5815f7..8326567377548 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java @@ -25,6 +25,7 @@ import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.Schema.TypeName; import org.apache.beam.sdk.schemas.logicaltypes.PassThroughLogicalType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.BiMap; import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableBiMap; import org.apache.beam.vendor.calcite.v1_20_0.com.google.common.collect.ImmutableMap; @@ -43,15 +44,6 @@ public class CalciteUtils { // SQL has schema types that do not directly correspond to Beam Schema types. We define // LogicalTypes to represent each of these types. - /** A LogicalType corresponding to DATE. */ - public static class DateType extends PassThroughLogicalType { - public static final String IDENTIFIER = "SqlDateType"; - - public DateType() { - super(IDENTIFIER, FieldType.STRING, "", FieldType.DATETIME); - } - } - /** A LogicalType corresponding to TIME. */ public static class TimeType extends PassThroughLogicalType { public static final String IDENTIFIER = "SqlTimeType"; @@ -96,7 +88,7 @@ public static boolean isDateTimeType(FieldType fieldType) { if (fieldType.getTypeName().isLogicalType()) { String logicalId = fieldType.getLogicalType().getIdentifier(); - return logicalId.equals(DateType.IDENTIFIER) + return logicalId.equals(SqlTypes.DATE.getIdentifier()) || logicalId.equals(TimeType.IDENTIFIER) || logicalId.equals(TimeWithLocalTzType.IDENTIFIER) || logicalId.equals(TimestampWithLocalTzType.IDENTIFIER); @@ -128,9 +120,9 @@ public static boolean isStringType(FieldType fieldType) { public static final FieldType VARBINARY = FieldType.BYTES; public static final FieldType VARCHAR = FieldType.STRING; public static final FieldType CHAR = FieldType.logicalType(new CharType()); - public static final FieldType DATE = FieldType.logicalType(new DateType()); + public static final FieldType DATE = FieldType.logicalType(SqlTypes.DATE); public static final FieldType NULLABLE_DATE = - FieldType.logicalType(new DateType()).withNullable(true); + FieldType.logicalType(SqlTypes.DATE).withNullable(true); public static final FieldType TIME = FieldType.logicalType(new TimeType()); public static final FieldType NULLABLE_TIME = FieldType.logicalType(new TimeType()).withNullable(true); @@ -205,12 +197,16 @@ public static SqlTypeName toSqlTypeName(FieldType type) { return SqlTypeName.MAP; default: SqlTypeName typeName = BEAM_TO_CALCITE_TYPE_MAPPING.get(type.withNullable(false)); - if (typeName != null) { - return typeName; - } else { + if (typeName == null) { // This will happen e.g. if looking up a STRING type, and metadata isn't set to say which // type of SQL string we want. In this case, use the default mapping. 
- return BEAM_TO_CALCITE_DEFAULT_MAPPING.get(type); + typeName = BEAM_TO_CALCITE_DEFAULT_MAPPING.get(type); + } + if (typeName == null) { + throw new IllegalArgumentException( + String.format("Cannot find a matching Calcite SqlTypeName for Beam type: %s", type)); + } else { + return typeName; } } } diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java index 8b3d7e502d26e..7cfcb951ca56c 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.extensions.sql; +import java.time.LocalDate; import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -26,6 +27,7 @@ import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestBoundedTable; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; @@ -373,40 +375,6 @@ public void testNullInnerRow() { pipeline.run().waitUntilFinish(Duration.standardMinutes(2)); } - @Test - public void testLogicalTypes() { - DateTime dateTime = DateTime.parse("2020-02-02T00:00:00"); - - Schema inputRowSchema = - Schema.builder() - .addField("timeTypeField", FieldType.logicalType(new DummySqlTimeType())) - .addField("dateTypeField", FieldType.logicalType(new DummySqlDateType())) - .build(); - - Row row = - Row.withSchema(inputRowSchema) - .addValues(dateTime.getMillis(), dateTime.getMillis()) - .build(); - - Schema outputRowSchema = - Schema.builder() - .addField("timeTypeField", FieldType.DATETIME) - .addNullableField("dateTypeField", FieldType.DATETIME) - .build(); - - PCollection outputRow = - pipeline - .apply(Create.of(row).withRowSchema(inputRowSchema)) - .apply( - SqlTransform.query( - "SELECT timeTypeField, dateTypeField FROM PCOLLECTION GROUP BY timeTypeField, dateTypeField")); - - PAssert.that(outputRow) - .containsInAnyOrder(Row.withSchema(outputRowSchema).addValues(dateTime, dateTime).build()); - - pipeline.run().waitUntilFinish(Duration.standardMinutes(2)); - } - private static class DummySqlTimeType implements Schema.LogicalType { @Override public String getIdentifier() { @@ -439,38 +407,6 @@ public Long toInputType(Instant base) { } } - private static class DummySqlDateType implements Schema.LogicalType { - @Override - public String getIdentifier() { - return "SqlDateType"; - } - - @Override - public FieldType getArgumentType() { - return FieldType.STRING; - } - - @Override - public String getArgument() { - return ""; - } - - @Override - public Schema.FieldType getBaseType() { - return Schema.FieldType.DATETIME; - } - - @Override - public Instant toBaseType(Long input) { - return (input == null ? null : new Instant((long) input)); - } - - @Override - public Long toInputType(Instant base) { - return (base == null ? 
null : base.getMillis()); - } - } - @Test public void testNullDatetimeFields() { Instant current = new Instant(1561671380000L); // Long value corresponds to 27/06/2019 @@ -483,14 +419,13 @@ public void testNullDatetimeFields() { .addField("timeTypeField", FieldType.logicalType(new DummySqlTimeType())) .addNullableField( "nullableTimeTypeField", FieldType.logicalType(new DummySqlTimeType())) - .addField("dateTypeField", FieldType.logicalType(new DummySqlDateType())) - .addNullableField( - "nullableDateTypeField", FieldType.logicalType(new DummySqlDateType())) + .addField("dateTypeField", FieldType.logicalType(SqlTypes.DATE)) + .addNullableField("nullableDateTypeField", FieldType.logicalType(SqlTypes.DATE)) .build(); Row dateTimeRow = Row.withSchema(dateTimeFieldSchema) - .addValues(current, null, date.getMillis(), null, current.getMillis(), null) + .addValues(current, null, date.getMillis(), null, LocalDate.of(2019, 6, 27), null) .build(); PCollection outputRow = diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCastTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCastTest.java index 01872a57dc71e..8f5fe6500c22e 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCastTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCastTest.java @@ -17,16 +17,15 @@ */ package org.apache.beam.sdk.extensions.sql; -import static org.apache.beam.sdk.schemas.Schema.FieldType.DATETIME; -import static org.joda.time.DateTimeZone.UTC; - +import java.time.LocalDate; +import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; -import org.joda.time.DateTime; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; @@ -48,7 +47,10 @@ public void testCastToDate() { .withRowSchema(INPUT_ROW_SCHEMA)); Schema resultType = - Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build(); + Schema.builder() + .addInt32Field("f_int") + .addNullableField("f_date", CalciteUtils.DATE) + .build(); PCollection result = input.apply( @@ -64,7 +66,7 @@ public void testCastToDate() { PAssert.that(result) .containsInAnyOrder( - Row.withSchema(resultType).addValues(1, new DateTime(2018, 10, 18, 0, 0, UTC)).build()); + Row.withSchema(resultType).addValues(1, LocalDate.of(2018, 10, 18)).build()); pipeline.run(); } @@ -76,7 +78,11 @@ public void testCastToDateWithCase() { Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build()) .withRowSchema(INPUT_ROW_SCHEMA)); - Schema resultType = Schema.builder().addInt32Field("f_int").addDateTimeField("f_date").build(); + Schema resultType = + Schema.builder() + .addInt32Field("f_int") + .addLogicalTypeField("f_date", SqlTypes.DATE) + .build(); PCollection result = input.apply( @@ -96,7 +102,7 @@ public void testCastToDateWithCase() { PAssert.that(result) .containsInAnyOrder( - Row.withSchema(resultType).addValues(1, new DateTime(2018, 10, 18, 0, 0, UTC)).build()); + Row.withSchema(resultType).addValues(1, LocalDate.of(2018, 10, 18)).build()); pipeline.run(); } diff --git 
a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCliTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCliTest.java index 214c4fa225e4e..5672849622dcb 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCliTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlCliTest.java @@ -28,6 +28,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; +import java.time.LocalDate; import java.util.stream.Stream; import org.apache.beam.sdk.extensions.sql.impl.ParseException; import org.apache.beam.sdk.extensions.sql.meta.Table; @@ -268,7 +269,7 @@ public void test_time_types() throws Exception { assertEquals(3, row.getFieldCount()); // test DATE field - assertEquals("2018-11-01", row.getDateTime("f_date").toString("yyyy-MM-dd")); + assertEquals("2018-11-01", row.getLogicalTypeValue("f_date", LocalDate.class).toString()); // test TIME field assertEquals("15:23:59.000", row.getDateTime("f_time").toString("HH:mm:ss.SSS")); // test TIMESTAMP field diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslSqlStdOperatorsTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslSqlStdOperatorsTest.java index 28ac9f41a159f..87cc62570d47c 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslSqlStdOperatorsTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslSqlStdOperatorsTest.java @@ -17,7 +17,6 @@ */ package org.apache.beam.sdk.extensions.sql; -import static org.apache.beam.sdk.extensions.sql.utils.DateTimeUtils.parseDate; import static org.apache.beam.sdk.extensions.sql.utils.DateTimeUtils.parseTime; import static org.apache.beam.sdk.extensions.sql.utils.DateTimeUtils.parseTimestampWithUTCTimeZone; import static org.hamcrest.Matchers.equalTo; @@ -33,6 +32,7 @@ import java.lang.reflect.Method; import java.math.BigDecimal; import java.math.RoundingMode; +import java.time.LocalDate; import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; @@ -40,6 +40,7 @@ import java.util.Random; import java.util.Set; import java.util.stream.Collectors; +import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils; import org.apache.beam.sdk.extensions.sql.integrationtest.BeamSqlBuiltinFunctionsIntegrationTestBase; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; @@ -1196,9 +1197,18 @@ public void testDatetimeInfixPlus() { .addExpr( "TIMESTAMP '1984-01-19 01:02:03' + INTERVAL '2' YEAR", parseTimestampWithUTCTimeZone("1986-01-19 01:02:03")) - .addExpr("DATE '1984-04-19' + INTERVAL '2' DAY", parseDate("1984-04-21")) - .addExpr("DATE '1984-04-19' + INTERVAL '1' MONTH", parseDate("1984-05-19")) - .addExpr("DATE '1984-04-19' + INTERVAL '3' YEAR", parseDate("1987-04-19")) + .addExpr( + "DATE '1984-04-19' + INTERVAL '2' DAY", + LocalDate.parse("1984-04-21"), + CalciteUtils.DATE) + .addExpr( + "DATE '1984-04-19' + INTERVAL '1' MONTH", + LocalDate.parse("1984-05-19"), + CalciteUtils.DATE) + .addExpr( + "DATE '1984-04-19' + INTERVAL '3' YEAR", + LocalDate.parse("1987-04-19"), + CalciteUtils.DATE) .addExpr("TIME '14:28:30' + INTERVAL '15' SECOND", parseTime("14:28:45")) .addExpr("TIME '14:28:30.239' + INTERVAL '4' MINUTE", parseTime("14:32:30.239")) .addExpr("TIME '14:28:30.2' + INTERVAL '4' HOUR", 
parseTime("18:28:30.2")); @@ -1317,9 +1327,18 @@ public void testTimestampMinusInterval() { .addExpr( "TIMESTAMP '1984-01-19 01:01:58' - INTERVAL '1' YEAR", parseTimestampWithUTCTimeZone("1983-01-19 01:01:58")) - .addExpr("DATE '1984-04-19' - INTERVAL '2' DAY", parseDate("1984-04-17")) - .addExpr("DATE '1984-04-19' - INTERVAL '1' MONTH", parseDate("1984-03-19")) - .addExpr("DATE '1984-04-19' - INTERVAL '3' YEAR", parseDate("1981-04-19")) + .addExpr( + "DATE '1984-04-19' - INTERVAL '2' DAY", + LocalDate.parse("1984-04-17"), + CalciteUtils.DATE) + .addExpr( + "DATE '1984-04-19' - INTERVAL '1' MONTH", + LocalDate.parse("1984-03-19"), + CalciteUtils.DATE) + .addExpr( + "DATE '1984-04-19' - INTERVAL '3' YEAR", + LocalDate.parse("1981-04-19"), + CalciteUtils.DATE) .addExpr("TIME '14:28:30' - INTERVAL '15' SECOND", parseTime("14:28:15")) .addExpr("TIME '14:28:30.239' - INTERVAL '4' MINUTE", parseTime("14:24:30.239")) .addExpr("TIME '14:28:30.2' - INTERVAL '4' HOUR", parseTime("10:28:30.2")); diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/integrationtest/BeamSqlDateFunctionsIntegrationTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/integrationtest/BeamSqlDateFunctionsIntegrationTest.java index 0342cf03beb68..c25d70e08823b 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/integrationtest/BeamSqlDateFunctionsIntegrationTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/integrationtest/BeamSqlDateFunctionsIntegrationTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import java.time.LocalDate; import java.util.Iterator; import org.apache.beam.sdk.extensions.sql.SqlTransform; import org.apache.beam.sdk.testing.PAssert; @@ -71,8 +72,12 @@ public Void apply(Iterable input) { assertTrue(millis - row.getDateTime(1).getMillis() > -1000); // CURRENT_DATE - assertTrue(millis - row.getDateTime(2).getMillis() < MILLIS_PER_DAY); - assertTrue(millis - row.getDateTime(2).getMillis() > -MILLIS_PER_DAY); + assertTrue( + millis - row.getLogicalTypeValue(2, LocalDate.class).toEpochDay() * MILLIS_PER_DAY + < MILLIS_PER_DAY); + assertTrue( + millis - row.getLogicalTypeValue(2, LocalDate.class).toEpochDay() * MILLIS_PER_DAY + > -MILLIS_PER_DAY); // CURRENT_TIME assertTrue(timeMillis - row.getDateTime(3).getMillis() < 1000); diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/SqlStdOperatorMappingTable.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/SqlStdOperatorMappingTable.java index 22b2de97caaf8..aa807fe80933d 100644 --- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/SqlStdOperatorMappingTable.java +++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/SqlStdOperatorMappingTable.java @@ -92,13 +92,28 @@ public class SqlStdOperatorMappingTable { FunctionSignatureId.FN_IFNULL, FunctionSignatureId.FN_NULLIF, + // Date functions + FunctionSignatureId.FN_CURRENT_DATE, // current_date + FunctionSignatureId.FN_EXTRACT_FROM_DATE, // $extract + FunctionSignatureId.FN_DATE_FROM_YEAR_MONTH_DAY, // date + FunctionSignatureId.FN_DATE_FROM_TIMESTAMP, // date + // FunctionSignatureId.FN_DATE_FROM_DATETIME, // date + FunctionSignatureId.FN_DATE_ADD_DATE, // date_add + FunctionSignatureId.FN_DATE_SUB_DATE, // date_sub + 
FunctionSignatureId.FN_DATE_DIFF_DATE, // date_diff + FunctionSignatureId.FN_DATE_TRUNC_DATE, // date_trunc + FunctionSignatureId.FN_FORMAT_DATE, // format_date + FunctionSignatureId.FN_PARSE_DATE, // parse_date + FunctionSignatureId.FN_UNIX_DATE, // unix_date + FunctionSignatureId.FN_DATE_FROM_UNIX_DATE, // date_from_unix_date + // Timestamp functions FunctionSignatureId.FN_CURRENT_TIMESTAMP, // current_timestamp FunctionSignatureId.FN_EXTRACT_FROM_TIMESTAMP, // $extract FunctionSignatureId.FN_STRING_FROM_TIMESTAMP, // string FunctionSignatureId.FN_TIMESTAMP_FROM_STRING, // timestamp FunctionSignatureId.FN_TIMESTAMP_FROM_DATE, // timestamp - // FunctionSignatureId.FN_TIMESTAMP_FROM_DATETIME // timestamp + // FunctionSignatureId.FN_TIMESTAMP_FROM_DATETIME, // timestamp FunctionSignatureId.FN_TIMESTAMP_ADD, // timestamp_add FunctionSignatureId.FN_TIMESTAMP_SUB, // timestamp_sub FunctionSignatureId.FN_TIMESTAMP_DIFF, // timestamp_diff @@ -115,13 +130,9 @@ public class SqlStdOperatorMappingTable { FunctionSignatureId.FN_TIMESTAMP_FROM_UNIX_MILLIS_INT64, // timestamp_from_unix_millis // FunctionSignatureId.FN_TIMESTAMP_FROM_UNIX_MICROS_INT64, // timestamp_from_unix_micros - // Date/Time/Datetime functions - FunctionSignatureId.FN_EXTRACT_FROM_DATE, + // Time/Datetime functions FunctionSignatureId.FN_EXTRACT_FROM_DATETIME, - FunctionSignatureId.FN_EXTRACT_FROM_TIME, - FunctionSignatureId.FN_DATE_FROM_YEAR_MONTH_DAY - // TODO: FunctionSignatureId.FN_DATE_FROM_TIMESTAMP - ); + FunctionSignatureId.FN_EXTRACT_FROM_TIME); // todo: Some of operators defined here are later overridden in ZetaSQLPlannerImpl. // We should remove them from this table and add generic way to provide custom @@ -314,11 +325,6 @@ public class SqlStdOperatorMappingTable { // .put("sha256") // .put("sha512") - // date functions - // .put("date_add", SqlStdOperatorTable.DATETIME_PLUS) - // .put("date_sub", SqlStdOperatorTable.MINUS_DATE) - .put("date", SqlOperators.DATE_OP) - // time functions // .put("time_add", SqlStdOperatorTable.DATETIME_PLUS) // .put("time_sub", SqlStdOperatorTable.MINUS_DATE) diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/TestInput.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/TestInput.java index a30825c2281d7..df5ff9b7d096e 100644 --- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/TestInput.java +++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/TestInput.java @@ -18,20 +18,18 @@ package org.apache.beam.sdk.extensions.sql.zetasql; import java.nio.charset.StandardCharsets; +import java.time.LocalDate; import java.util.Arrays; -import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.DateType; -import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeType; import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestBoundedTable; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; /** TestInput. 
*/ class TestInput { - public static final FieldType DATE = FieldType.logicalType(new DateType()); - public static final FieldType TIME = FieldType.logicalType(new TimeType()); public static final TestBoundedTable BASIC_TABLE_ONE = TestBoundedTable.of( @@ -155,20 +153,6 @@ class TestInput { DateTimeUtils.parseTimestampWithUTCTimeZone("2018-07-01 21:26:13"), 7L); - public static final TestBoundedTable TIME_TABLE = - TestBoundedTable.of( - Schema.builder() - .addNullableField("f_date", DATE) - .addNullableField("f_time", TIME) - .addNullableField("f_timestamp", FieldType.DATETIME) - .addNullableField("f_timestamp_with_time_zone", FieldType.DATETIME) - .build()) - .addRows( - DateTimeUtils.parseTimestampWithUTCTimeZone("2018-07-11 00:00:00"), - DateTimeUtils.parseTimestampWithUTCTimeZone("1970-01-01 12:33:59.348"), - DateTimeUtils.parseTimestampWithUTCTimeZone("2018-12-20 23:59:59.999"), - DateTimeUtils.parseTimestampWithTimeZone("2018-12-10 10:38:59-1000")); - public static final TestBoundedTable TABLE_ALL_NULL = TestBoundedTable.of( Schema.builder() @@ -249,6 +233,16 @@ class TestInput { ImmutableMap.of("MAP_KEY_1", "MAP_VALUE_1"), Row.withSchema(structSchema).addValues(1L, "data1").build()); + private static final Schema TABLE_WITH_DATE_SCHEMA = + Schema.builder() + .addLogicalTypeField("date_field", SqlTypes.DATE) + .addStringField("str_field") + .build(); + public static final TestBoundedTable TABLE_WITH_DATE = + TestBoundedTable.of(TABLE_WITH_DATE_SCHEMA) + .addRows(LocalDate.of(2008, 12, 25), "str1") + .addRows(LocalDate.of(2020, 4, 7), "str2"); + public static byte[] stringToBytes(String s) { return s.getBytes(StandardCharsets.UTF_8); } diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSqlUtils.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSqlUtils.java index f74e35ff245c0..63c89cbfa95a8 100644 --- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSqlUtils.java +++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSqlUtils.java @@ -25,15 +25,16 @@ import com.google.zetasql.TypeFactory; import com.google.zetasql.Value; import com.google.zetasql.ZetaSQLType.TypeKind; +import java.time.LocalDate; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.DateType; import org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeType; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.type.SqlTypeName; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.math.LongMath; @@ -52,7 +53,8 @@ public final class ZetaSqlUtils { private ZetaSqlUtils() {} - public static SqlTypeName zetaSqlTypeToCalciteType(TypeKind zetaSqlType) { + // Type conversion: ZetaSQL => Calcite + public static SqlTypeName zetaSqlTypeToCalciteTypeName(TypeKind zetaSqlType) { switch (zetaSqlType) { case TYPE_INT64: return SqlTypeName.BIGINT; @@ -62,6 +64,8 @@ public static SqlTypeName zetaSqlTypeToCalciteType(TypeKind zetaSqlType) { return SqlTypeName.DOUBLE; case TYPE_STRING: return SqlTypeName.VARCHAR; + case
TYPE_DATE: + return SqlTypeName.DATE; case TYPE_TIMESTAMP: return SqlTypeName.TIMESTAMP; case TYPE_BOOL: @@ -70,10 +74,11 @@ public static SqlTypeName zetaSqlTypeToCalciteType(TypeKind zetaSqlType) { return SqlTypeName.VARBINARY; // TODO[BEAM-9179] Add conversion code for ARRAY and ROW types default: - throw new IllegalArgumentException("Unsupported ZetaSQL type: " + zetaSqlType.name()); + throw new UnsupportedOperationException("Unknown ZetaSQL type: " + zetaSqlType.name()); } } + // Type conversion: Beam => ZetaSQL public static Type beamFieldTypeToZetaSqlType(FieldType fieldType) { switch (fieldType.getTypeName()) { case INT64: @@ -93,31 +98,22 @@ public static Type beamFieldTypeToZetaSqlType(FieldType fieldType) { case BYTES: return TypeFactory.createSimpleType(TypeKind.TYPE_BYTES); case ARRAY: - return createZetaSqlArrayTypeFromBeamElementFieldType(fieldType.getCollectionElementType()); + return beamElementFieldTypeToZetaSqlArrayType(fieldType.getCollectionElementType()); case ROW: - return createZetaSqlStructTypeFromBeamSchema(fieldType.getRowSchema()); + return beamSchemaToZetaSqlStructType(fieldType.getRowSchema()); case LOGICAL_TYPE: - switch (fieldType.getLogicalType().getIdentifier()) { - case DateType.IDENTIFIER: - return TypeFactory.createSimpleType(TypeKind.TYPE_DATE); - case TimeType.IDENTIFIER: - return TypeFactory.createSimpleType(TypeKind.TYPE_TIME); - default: - throw new IllegalArgumentException( - "Unsupported Beam logical type: " + fieldType.getLogicalType().getIdentifier()); - } + return beamLogicalTypeToZetaSqlType(fieldType.getLogicalType().getIdentifier()); default: throw new UnsupportedOperationException( - "Unsupported Beam fieldType: " + fieldType.getTypeName()); + "Unknown Beam fieldType: " + fieldType.getTypeName()); } } - private static ArrayType createZetaSqlArrayTypeFromBeamElementFieldType( - FieldType elementFieldType) { + private static ArrayType beamElementFieldTypeToZetaSqlArrayType(FieldType elementFieldType) { return TypeFactory.createArrayType(beamFieldTypeToZetaSqlType(elementFieldType)); } - public static StructType createZetaSqlStructTypeFromBeamSchema(Schema schema) { + public static StructType beamSchemaToZetaSqlStructType(Schema schema) { return TypeFactory.createStructType( schema.getFields().stream() .map(ZetaSqlUtils::beamFieldToZetaSqlStructField) @@ -128,6 +124,17 @@ private static StructField beamFieldToZetaSqlStructField(Field field) { return new StructField(field.getName(), beamFieldTypeToZetaSqlType(field.getType())); } + private static Type beamLogicalTypeToZetaSqlType(String identifier) { + if (SqlTypes.DATE.getIdentifier().equals(identifier)) { + return TypeFactory.createSimpleType(TypeKind.TYPE_DATE); + } else if (TimeType.IDENTIFIER.equals(identifier)) { + return TypeFactory.createSimpleType(TypeKind.TYPE_TIME); + } else { + throw new UnsupportedOperationException("Unknown Beam logical type: " + identifier); + } + } + + // Value conversion: Beam => ZetaSQL public static Value javaObjectToZetaSqlValue(Object object, FieldType fieldType) { if (object == null) { return Value.createNullValue(beamFieldTypeToZetaSqlType(fieldType)); @@ -153,9 +160,11 @@ public static Value javaObjectToZetaSqlValue(Object object, FieldType fieldType) (List) object, fieldType.getCollectionElementType()); case ROW: return beamRowToZetaSqlStructValue((Row) object, fieldType.getRowSchema()); + case LOGICAL_TYPE: + return beamLogicalObjectToZetaSqlValue(fieldType.getLogicalType().getIdentifier(), object); default: throw new UnsupportedOperationException( - 
"Unsupported Beam fieldType: " + fieldType.getTypeName()); + "Unknown Beam fieldType: " + fieldType.getTypeName()); } } @@ -173,8 +182,7 @@ private static Value javaListToZetaSqlArrayValue(List elements, FieldTyp elements.stream() .map(e -> javaObjectToZetaSqlValue(e, elementType)) .collect(Collectors.toList()); - return Value.createArrayValue( - createZetaSqlArrayTypeFromBeamElementFieldType(elementType), values); + return Value.createArrayValue(beamElementFieldTypeToZetaSqlArrayType(elementType), values); } public static Value beamRowToZetaSqlStructValue(Row row, Schema schema) { @@ -185,9 +193,22 @@ public static Value beamRowToZetaSqlStructValue(Row row, Schema schema) { javaObjectToZetaSqlValue( row.getBaseValue(i, Object.class), schema.getField(i).getType())); } - return Value.createStructValue(createZetaSqlStructTypeFromBeamSchema(schema), values); + return Value.createStructValue(beamSchemaToZetaSqlStructType(schema), values); + } + + private static Value beamLogicalObjectToZetaSqlValue(String identifier, Object object) { + if (SqlTypes.DATE.getIdentifier().equals(identifier)) { + if (object instanceof Long) { // base type + return Value.createDateValue(((Long) object).intValue()); + } else { // input type + return Value.createDateValue((int) ((LocalDate) object).toEpochDay()); + } + } else { + throw new UnsupportedOperationException("Unknown Beam logical type: " + identifier); + } } + // Value conversion: ZetaSQL => Beam public static Object zetaSqlValueToJavaObject( Value value, FieldType fieldType, boolean verifyValues) { if (value.isNull()) { @@ -218,9 +239,11 @@ public static Object zetaSqlValueToJavaObject( value, fieldType.getCollectionElementType(), verifyValues); case ROW: return zetaSqlStructValueToBeamRow(value, fieldType.getRowSchema(), verifyValues); + case LOGICAL_TYPE: + return zetaSqlValueToBeamLogicalObject(fieldType.getLogicalType().getIdentifier(), value); default: throw new UnsupportedOperationException( - "Unsupported Beam fieldType: " + fieldType.getTypeName()); + "Unknown Beam fieldType: " + fieldType.getTypeName()); } } @@ -250,4 +273,12 @@ private static Row zetaSqlStructValueToBeamRow( : Row.withSchema(schema).attachValues(objects); return row; } + + private static Object zetaSqlValueToBeamLogicalObject(String identifier, Value value) { + if (SqlTypes.DATE.getIdentifier().equals(identifier)) { + return LocalDate.ofEpochDay(value.getDateValue()); + } else { + throw new UnsupportedOperationException("Unknown Beam logical type: " + identifier); + } + } } diff --git a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java index 788e1a4dfce60..5ebd2e7c9cbda 100644 --- a/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java +++ b/sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/translation/ExpressionConverter.java @@ -798,7 +798,7 @@ private RexNode convertResolvedFunctionCall( if (returnType != null) { op = SqlOperators.createSimpleSqlFunction( - funName, ZetaSqlUtils.zetaSqlTypeToCalciteType(returnType.getKind())); + funName, ZetaSqlUtils.zetaSqlTypeToCalciteTypeName(returnType.getKind())); } else { throw new UnsupportedOperationException("Does not support ZetaSQL function: " + funName); } diff --git 
a/sdks/java/extensions/sql/zetasql/src/test/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLDialectSpecTest.java b/sdks/java/extensions/sql/zetasql/src/test/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLDialectSpecTest.java index b3827985c5715..6418dabe47688 100644 --- a/sdks/java/extensions/sql/zetasql/src/test/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLDialectSpecTest.java +++ b/sdks/java/extensions/sql/zetasql/src/test/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSQLDialectSpecTest.java @@ -36,13 +36,13 @@ import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_FOR_CASE_WHEN; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_ARRAY; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_ARRAY_FOR_UNNEST; +import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_DATE; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_MAP; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_STRUCT; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_STRUCT_TIMESTAMP_STRING; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TABLE_WITH_STRUCT_TWO; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TIMESTAMP_TABLE_ONE; import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TIMESTAMP_TABLE_TWO; -import static org.apache.beam.sdk.extensions.sql.zetasql.TestInput.TIME_TABLE; import static org.apache.beam.sdk.schemas.Schema.FieldType.DATETIME; import static org.junit.Assert.assertTrue; @@ -55,6 +55,7 @@ import com.google.zetasql.ZetaSQLType.TypeKind; import com.google.zetasql.ZetaSQLValue.ValueProto; import java.nio.charset.StandardCharsets; +import java.time.LocalDate; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; @@ -74,6 +75,7 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.values.PCollection; @@ -2376,6 +2378,285 @@ public void testZetaSQLNestedQueryFive() { pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); } + ///////////////////////////////////////////////////////////////////////////// + // DATE type tests + ///////////////////////////////////////////////////////////////////////////// + + @Test + public void testDateLiteral() { + String sql = "SELECT DATE '2020-3-30'"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2020, 3, 30)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateColumn() { + String sql = "SELECT FORMAT_DATE('%b-%d-%Y', date_field) FROM table_with_date"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, 
beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addStringField("f_date_str").build()) + .addValues("Dec-25-2008") + .build(), + Row.withSchema(Schema.builder().addStringField("f_date_str").build()) + .addValues("Apr-07-2020") + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + // TODO[BEAM-9166]: Add a test for CURRENT_DATE function ("SELECT CURRENT_DATE()") + + @Test + public void testExtractDate() { + String sql = + "WITH Dates AS (\n" + + " SELECT DATE '2015-12-31' AS date UNION ALL\n" + + " SELECT DATE '2016-01-01'\n" + + ")\n" + + "SELECT\n" + + " EXTRACT(ISOYEAR FROM date) AS isoyear,\n" + + " EXTRACT(YEAR FROM date) AS year,\n" + + " EXTRACT(ISOWEEK FROM date) AS isoweek,\n" + // TODO[BEAM-9178]: Add tests for DATE_TRUNC and EXTRACT with "week with weekday" date + // parts once they are supported + // + " EXTRACT(WEEK FROM date) AS week,\n" + + " EXTRACT(MONTH FROM date) AS month\n" + + "FROM Dates\n"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + final Schema schema = + Schema.builder() + .addField("isoyear", FieldType.INT64) + .addField("year", FieldType.INT64) + .addField("isoweek", FieldType.INT64) + // .addField("week", FieldType.INT64) + .addField("month", FieldType.INT64) + .build(); + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(schema).addValues(2015L, 2015L, 53L /* , 52L */, 12L).build(), + Row.withSchema(schema).addValues(2015L, 2016L, 53L /* , 0L */, 1L).build()); + + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateFromYearMonthDay() { + String sql = "SELECT DATE(2008, 12, 25)"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2008, 12, 25)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateFromTimestamp() { + String sql = "SELECT DATE(TIMESTAMP '2016-12-25 05:30:00+07', 'America/Los_Angeles')"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2016, 12, 24)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateAdd() { + String sql = + "SELECT " + + "DATE_ADD(DATE '2008-12-25', INTERVAL 5 DAY), " + + "DATE_ADD(DATE '2008-12-25', INTERVAL 1 MONTH), " + + "DATE_ADD(DATE '2008-12-25', INTERVAL 1 YEAR), "; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + 
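+ // The expected row below is built with java.time.LocalDate values: DATE columns
+ // now surface through the SqlTypes.DATE logical type rather than Joda types,
+ // matching the parseDate(...)-based expectations this patch removes elsewhere.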
PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema( + Schema.builder() + .addLogicalTypeField("f_date1", SqlTypes.DATE) + .addLogicalTypeField("f_date2", SqlTypes.DATE) + .addLogicalTypeField("f_date3", SqlTypes.DATE) + .build()) + .addValues( + LocalDate.of(2008, 12, 30), + LocalDate.of(2009, 1, 25), + LocalDate.of(2009, 12, 25)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateSub() { + String sql = + "SELECT " + + "DATE_SUB(DATE '2008-12-25', INTERVAL 5 DAY), " + + "DATE_SUB(DATE '2008-12-25', INTERVAL 1 MONTH), " + + "DATE_SUB(DATE '2008-12-25', INTERVAL 1 YEAR), "; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema( + Schema.builder() + .addLogicalTypeField("f_date1", SqlTypes.DATE) + .addLogicalTypeField("f_date2", SqlTypes.DATE) + .addLogicalTypeField("f_date3", SqlTypes.DATE) + .build()) + .addValues( + LocalDate.of(2008, 12, 20), + LocalDate.of(2008, 11, 25), + LocalDate.of(2007, 12, 25)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateDiff() { + String sql = "SELECT DATE_DIFF(DATE '2010-07-07', DATE '2008-12-25', DAY)"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addInt64Field("f_date_diff").build()) + .addValues(559L) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateDiffNegativeResult() { + String sql = "SELECT DATE_DIFF(DATE '2017-12-17', DATE '2017-12-18', ISOWEEK)"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addInt64Field("f_date_diff").build()) + .addValues(-1L) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateTrunc() { + String sql = "SELECT DATE_TRUNC(DATE '2015-06-15', ISOYEAR)"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema( + Schema.builder().addLogicalTypeField("f_date_trunc", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2014, 12, 29)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testFormatDate() { + String sql = "SELECT FORMAT_DATE('%b-%d-%Y', DATE '2008-12-25')"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, 
beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addStringField("f_date_str").build()) + .addValues("Dec-25-2008") + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testParseDate() { + String sql = "SELECT PARSE_DATE('%m %d %y', '10 14 18')"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2018, 10, 14)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateToUnixInt64() { + String sql = "SELECT UNIX_DATE(DATE '2008-12-25')"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addInt64Field("f_unix_date").build()) + .addValues(14238L) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + + @Test + public void testDateFromUnixInt64() { + String sql = "SELECT DATE_FROM_UNIX_DATE(14238)"; + + ZetaSQLQueryPlanner zetaSQLQueryPlanner = new ZetaSQLQueryPlanner(config); + BeamRelNode beamRelNode = zetaSQLQueryPlanner.convertToBeamRel(sql); + PCollection stream = BeamSqlRelUtils.toPCollection(pipeline, beamRelNode); + + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema(Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build()) + .addValues(LocalDate.of(2008, 12, 25)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(PIPELINE_EXECUTION_WAITTIME_MINUTES)); + } + ///////////////////////////////////////////////////////////////////////////// // TIMESTAMP type tests ///////////////////////////////////////////////////////////////////////////// @@ -2509,7 +2790,7 @@ public void testExtractTimestamp() { + "SELECT\n" + " EXTRACT(ISOYEAR FROM timestamp) AS isoyear,\n" + " EXTRACT(YEAR FROM timestamp) AS year,\n" - + " EXTRACT(ISOWEEK FROM timestamp) AS week,\n" + + " EXTRACT(ISOWEEK FROM timestamp) AS isoweek,\n" // TODO[BEAM-9178]: Add tests for TIMESTAMP_TRUNC and EXTRACT with "week with weekday" // date parts once they are supported // + " EXTRACT(WEEK FROM timestamp) AS week,\n" @@ -4552,7 +4833,6 @@ private void initializeBeamTableProvider() { testBoundedTableMap.put("aggregate_test_table", AGGREGATE_TABLE_ONE); testBoundedTableMap.put("window_test_table", TIMESTAMP_TABLE_ONE); testBoundedTableMap.put("window_test_table_two", TIMESTAMP_TABLE_TWO); - testBoundedTableMap.put("time_test_table", TIME_TABLE); testBoundedTableMap.put("all_null_table", TABLE_ALL_NULL); testBoundedTableMap.put("table_with_struct", TABLE_WITH_STRUCT); testBoundedTableMap.put("table_with_struct_two", TABLE_WITH_STRUCT_TWO); @@ -4564,6 +4844,7 @@ private void initializeBeamTableProvider() { testBoundedTableMap.put("table_all_types", TABLE_ALL_TYPES); testBoundedTableMap.put("table_all_types_2", TABLE_ALL_TYPES_2); testBoundedTableMap.put("table_with_map", TABLE_WITH_MAP); + testBoundedTableMap.put("table_with_date", 
TABLE_WITH_DATE); testBoundedTableMap.put("table_with_struct_ts_string", TABLE_WITH_STRUCT_TIMESTAMP_STRING); tableProvider = new ReadOnlyTableProvider("test_table_provider", testBoundedTableMap); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java index 16ed4b5c0c0dd..69f438055da9b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java @@ -953,9 +953,23 @@ public SpannerWriteResult expand(PCollection input) { @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); + populateDisplayDataWithParameters(builder); + } + + private void populateDisplayDataWithParameters(DisplayData.Builder builder) { getSpannerConfig().populateDisplayData(builder); builder.add( - DisplayData.item("batchSizeBytes", getBatchSizeBytes()).withLabel("Batch Size in Bytes")); + DisplayData.item("batchSizeBytes", getBatchSizeBytes()) + .withLabel("Max batch size in bytes")); + builder.add( + DisplayData.item("maxNumMutations", getMaxNumMutations()) + .withLabel("Max number of mutated cells in each batch")); + builder.add( + DisplayData.item("maxNumRows", getMaxNumRows()) + .withLabel("Max number of rows in each batch")); + builder.add( + DisplayData.item("groupingFactor", getGroupingFactor()) + .withLabel("Number of batches to sort over")); } } @@ -992,6 +1006,12 @@ public WriteGrouped(Write spec) { this.spec = spec; } + @Override + public void populateDisplayData(DisplayData.Builder builder) { + super.populateDisplayData(builder); + spec.populateDisplayDataWithParameters(builder); + } + @Override public SpannerWriteResult expand(PCollection input) { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOWriteTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOWriteTest.java index 6129882b4cf70..aaca469105c4e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOWriteTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIOWriteTest.java @@ -589,20 +589,50 @@ public void retryOnAbortedAndDeadlineExceeded() throws InterruptedException { } @Test - public void displayData() throws Exception { + public void displayDataWrite() throws Exception { SpannerIO.Write write = SpannerIO.write() .withProjectId("test-project") .withInstanceId("test-instance") .withDatabaseId("test-database") - .withBatchSizeBytes(123); + .withBatchSizeBytes(123) + .withMaxNumMutations(456) + .withMaxNumRows(789) + .withGroupingFactor(100); DisplayData data = DisplayData.from(write); - assertThat(data.items(), hasSize(4)); + assertThat(data.items(), hasSize(7)); assertThat(data, hasDisplayItem("projectId", "test-project")); assertThat(data, hasDisplayItem("instanceId", "test-instance")); assertThat(data, hasDisplayItem("databaseId", "test-database")); assertThat(data, hasDisplayItem("batchSizeBytes", 123)); + assertThat(data, hasDisplayItem("maxNumMutations", 456)); + assertThat(data, hasDisplayItem("maxNumRows", 789)); + assertThat(data, hasDisplayItem("groupingFactor", 100)); + } + + @Test + public void displayDataWriteGrouped() throws Exception { + SpannerIO.WriteGrouped
writeGrouped = + SpannerIO.write() + .withProjectId("test-project") + .withInstanceId("test-instance") + .withDatabaseId("test-database") + .withBatchSizeBytes(123) + .withMaxNumMutations(456) + .withMaxNumRows(789) + .withGroupingFactor(100) + .grouped(); + + DisplayData data = DisplayData.from(writeGrouped); + assertThat(data.items(), hasSize(7)); + assertThat(data, hasDisplayItem("projectId", "test-project")); + assertThat(data, hasDisplayItem("instanceId", "test-instance")); + assertThat(data, hasDisplayItem("databaseId", "test-database")); + assertThat(data, hasDisplayItem("batchSizeBytes", 123)); + assertThat(data, hasDisplayItem("maxNumMutations", 456)); + assertThat(data, hasDisplayItem("maxNumRows", 789)); + assertThat(data, hasDisplayItem("groupingFactor", 100)); } @Test diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py index 07c5f8809fc7d..02be0062296f7 100644 --- a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py +++ b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py @@ -1021,7 +1021,7 @@ def run_ParDo(self, transform_node, options): # will be 'None' for main output and '' for a tagged output. outputs = [] - all_output_tags = transform_proto.outputs.keys() + all_output_tags = list(transform_proto.outputs.keys()) # Some external transforms require output tags to not be modified. # So we randomly select one of the output tags as the main output and diff --git a/sdks/python/apache_beam/runners/portability/abstract_job_service.py b/sdks/python/apache_beam/runners/portability/abstract_job_service.py index d1bc313566fa3..f1c26c71ae3d5 100644 --- a/sdks/python/apache_beam/runners/portability/abstract_job_service.py +++ b/sdks/python/apache_beam/runners/portability/abstract_job_service.py @@ -18,6 +18,7 @@ from __future__ import absolute_import +import copy import itertools import json import logging @@ -46,7 +47,9 @@ from apache_beam.utils.timestamp import Timestamp if TYPE_CHECKING: - from google.protobuf import struct_pb2 # pylint: disable=ungrouped-imports + # pylint: disable=ungrouped-imports + from typing import BinaryIO + from google.protobuf import struct_pb2 from apache_beam.portability.api import beam_runner_api_pb2 _LOGGER = logging.getLogger(__name__) @@ -278,6 +281,27 @@ def to_runner_api(self): state=self.state) +class JarArtifactManager(object): + def __init__(self, jar_path, root): + self._root = root + self._zipfile_handle = zipfile.ZipFile(jar_path, 'a') + + def close(self): + self._zipfile_handle.close() + + def file_writer(self, path): + # type: (str) -> Tuple[BinaryIO, str] + + """Given a relative path, returns an open handle that can be written to + and a reference that can later be used to read this file.""" + full_path = '%s/%s' % (self._root, path) + return self._zipfile_handle.open( + full_path, 'w', force_zip64=True), 'classpath://%s' % full_path + + def zipfile_handle(self): + return self._zipfile_handle + + class UberJarBeamJob(AbstractBeamJob): """Abstract baseclass for creating a Beam job.
The resulting job will be packaged and run in an executable uber jar.""" @@ -291,8 +315,6 @@ class UberJarBeamJob(AbstractBeamJob): PIPELINE_PATH = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, "pipeline.json"]) PIPELINE_OPTIONS_PATH = '/'.join( [PIPELINE_FOLDER, PIPELINE_NAME, 'pipeline-options.json']) - ARTIFACT_MANIFEST_PATH = '/'.join( - [PIPELINE_FOLDER, PIPELINE_NAME, 'artifact-manifest.json']) ARTIFACT_FOLDER = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, 'artifacts']) def __init__( @@ -313,27 +335,23 @@ def prepare(self): with tempfile.NamedTemporaryFile(suffix='.jar') as tout: self._jar = tout.name shutil.copy(self._executable_jar, self._jar) - with zipfile.ZipFile(self._jar, 'a', compression=zipfile.ZIP_DEFLATED) as z: - with z.open(self.PIPELINE_PATH, 'w') as fout: - fout.write( - json_format.MessageToJson(self._pipeline_proto).encode('utf-8')) - with z.open(self.PIPELINE_OPTIONS_PATH, 'w') as fout: - fout.write( - json_format.MessageToJson(self._pipeline_options).encode('utf-8')) - with z.open(self.PIPELINE_MANIFEST, 'w') as fout: - fout.write( - json.dumps({ - 'defaultJobName': self.PIPELINE_NAME - }).encode('utf-8')) self._start_artifact_service(self._jar, self._artifact_port) def _start_artifact_service(self, jar, requested_port): - self._artifact_staging_service = artifact_service.ZipFileArtifactService( - jar, self.ARTIFACT_FOLDER) + self._artifact_manager = JarArtifactManager(self._jar, self.ARTIFACT_FOLDER) + self._artifact_staging_service = artifact_service.ArtifactStagingService( + self._artifact_manager.file_writer) + self._artifact_staging_service.register_job( + self._job_id, + { + env_id: env.dependencies + for (env_id, + env) in self._pipeline_proto.components.environments.items() + }) self._artifact_staging_server = grpc.server(futures.ThreadPoolExecutor()) port = self._artifact_staging_server.add_insecure_port( '[::]:%s' % requested_port) - beam_artifact_api_pb2_grpc.add_LegacyArtifactStagingServiceServicer_to_server( + beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( self._artifact_staging_service, self._artifact_staging_server) self._artifact_staging_endpoint = endpoints_pb2.ApiServiceDescriptor( url='localhost:%d' % port) @@ -343,9 +361,34 @@ def _start_artifact_service(self, jar, requested_port): def _stop_artifact_service(self): self._artifact_staging_server.stop(1) - self._artifact_staging_service.close() - self._artifact_manifest_location = ( - self._artifact_staging_service.retrieval_token(self._job_id)) + + # Update dependencies to point to staged files. + pipeline = copy.copy(self._pipeline_proto) + if any(env.dependencies + for env in pipeline.components.environments.values()): + for env_id, deps in self._artifact_staging_service.resolved_deps( + self._job_id).items(): + # Slice assignment not supported for repeated fields. + env = self._pipeline_proto.components.environments[env_id] + del env.dependencies[:] + env.dependencies.extend(deps) + + # Copy the pipeline definition and metadata into the jar. + z = self._artifact_manager.zipfile_handle() + with z.open(self.PIPELINE_PATH, 'w') as fout: + fout.write( + json_format.MessageToJson(self._pipeline_proto).encode('utf-8')) + with z.open(self.PIPELINE_OPTIONS_PATH, 'w') as fout: + fout.write( + json_format.MessageToJson(self._pipeline_options).encode('utf-8')) + with z.open(self.PIPELINE_MANIFEST, 'w') as fout: + fout.write( + json.dumps({ + 'defaultJobName': self.PIPELINE_NAME + }).encode('utf-8')) + + # Closes the jar file. 
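+ # Closing also writes the zip central directory, so the pipeline definition,
+ # options, manifest, and staged artifacts added above only become readable
+ # from the jar after this call returns.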
+ self._artifact_manager.close() def artifact_staging_endpoint(self): return self._artifact_staging_endpoint diff --git a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py index 77b6292cc8b7c..4fa92cddc55ab 100644 --- a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py +++ b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py @@ -27,7 +27,6 @@ import tempfile import time import urllib -import zipfile import requests from google.protobuf import json_format @@ -146,12 +145,6 @@ def delete(self, path, **kwargs): def run(self): self._stop_artifact_service() - # Move the artifact manifest to the expected location. - with zipfile.ZipFile(self._jar, 'a', compression=zipfile.ZIP_DEFLATED) as z: - with z.open(self._artifact_manifest_location) as fin: - manifest_contents = fin.read() - with z.open(self.ARTIFACT_MANIFEST_PATH, 'w') as fout: - fout.write(manifest_contents) # Upload the jar and start the job. with open(self._jar, 'rb') as jar_file: diff --git a/sdks/python/apache_beam/runners/portability/local_job_service_test.py b/sdks/python/apache_beam/runners/portability/local_job_service_test.py index f1b17ab7069a4..ec164e0e8ffda 100644 --- a/sdks/python/apache_beam/runners/portability/local_job_service_test.py +++ b/sdks/python/apache_beam/runners/portability/local_job_service_test.py @@ -22,10 +22,6 @@ import logging import unittest -import grpc - -from apache_beam.portability.api import beam_artifact_api_pb2 -from apache_beam.portability.api import beam_artifact_api_pb2_grpc from apache_beam.portability.api import beam_job_api_pb2 from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.runners.portability import local_job_service @@ -41,17 +37,6 @@ def __init__(self, job_service): def get_pipeline_options(self): return None - def stage(self, pipeline, artifact_staging_endpoint, staging_session_token): - channel = grpc.insecure_channel(artifact_staging_endpoint) - staging_stub = beam_artifact_api_pb2_grpc.LegacyArtifactStagingServiceStub( - channel) - manifest_response = staging_stub.CommitManifest( - beam_artifact_api_pb2.CommitManifestRequest( - staging_session_token=staging_session_token, - manifest=beam_artifact_api_pb2.Manifest())) - channel.close() - return manifest_response.retrieval_token - class LocalJobServerTest(unittest.TestCase): def test_end_to_end(self): diff --git a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py index 7d3ada269948a..252f70a727977 100644 --- a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py +++ b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py @@ -183,12 +183,6 @@ def _create_submission_request(self, jar, job_name): def run(self): self._stop_artifact_service() - # Move the artifact manifest to the expected location. - with zipfile.ZipFile(self._jar, 'a', compression=zipfile.ZIP_DEFLATED) as z: - with z.open(self._artifact_manifest_location) as fin: - manifest_contents = fin.read() - with z.open(self.ARTIFACT_MANIFEST_PATH, 'w') as fout: - fout.write(manifest_contents) # Upload the jar and start the job. 
self._spark_submission_id = self.post( diff --git a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py index b08d5b8a903a9..3ae25d4169f78 100644 --- a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py +++ b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py @@ -28,18 +28,16 @@ import zipfile import freezegun -import grpc import requests_mock from apache_beam.options import pipeline_options from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import SparkRunnerOptions -from apache_beam.portability.api import beam_artifact_api_pb2 -from apache_beam.portability.api import beam_artifact_api_pb2_grpc from apache_beam.portability.api import beam_job_api_pb2 from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.runners.portability import spark_runner from apache_beam.runners.portability import spark_uber_jar_job_server +from apache_beam.runners.portability.local_job_service_test import TestJobServicePlan @contextlib.contextmanager @@ -135,17 +133,14 @@ def spark_submission_status_response(state): 'http://host:6066', options) # Prepare the job. - prepare_response = job_server.Prepare( - beam_job_api_pb2.PrepareJobRequest( - job_name='job', pipeline=beam_runner_api_pb2.Pipeline())) - channel = grpc.insecure_channel( - prepare_response.artifact_staging_endpoint.url) - retrieval_token = beam_artifact_api_pb2_grpc.LegacyArtifactStagingServiceStub( - channel).CommitManifest( - beam_artifact_api_pb2.CommitManifestRequest( - staging_session_token=prepare_response.staging_session_token, - manifest=beam_artifact_api_pb2.Manifest())).retrieval_token - channel.close() + plan = TestJobServicePlan(job_server) + + # Prepare the job. + prepare_response = plan.prepare(beam_runner_api_pb2.Pipeline()) + retrieval_token = plan.stage( + beam_runner_api_pb2.Pipeline(), + prepare_response.artifact_staging_endpoint.url, + prepare_response.staging_session_token) # Now actually run the job. 
http_mock.post( diff --git a/sdks/python/setup.py b/sdks/python/setup.py index e9c6e57e0672b..f4d9434e5c1bc 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -153,7 +153,7 @@ def get_version(): 'futures>=3.2.0,<4.0.0; python_version < "3.0"', 'grpcio>=1.12.1,<2', 'hdfs>=2.1.0,<3.0.0', - 'httplib2>=0.8,<0.16.0', + 'httplib2>=0.8,<0.18.0', 'mock>=1.0.1,<3.0.0', 'numpy>=1.14.3,<2', 'pymongo>=3.8.0,<4.0.0', @@ -197,7 +197,7 @@ def get_version(): GCP_REQUIREMENTS = [ 'cachetools>=3.1.0,<4', - 'google-apitools>=0.5.28,<0.5.29', + 'google-apitools>=0.5.31,<0.5.32', 'google-cloud-datastore>=1.7.1,<1.8.0', 'google-cloud-pubsub>=0.39.0,<1.1.0', # GCP packages required by tests diff --git a/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html b/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html index b310b26181c36..317b06141c37a 100644 --- a/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html +++ b/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html @@ -1,20 +1,21 @@ -{{/* + + @@ -76,6 +77,11 @@ + + + + + diff --git a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html index 4440ec0478e26..4170e25c543ce 100644 --- a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html +++ b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html @@ -1,20 +1,21 @@ -{{/* - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. See accompanying LICENSE file. -*/}} -{{/* -This is an auto-generated file. -Use generatePipelineOptionsTableJava and generatePipelineOptionsTablePython respectively -which should be called before running the tests. -*/}} + +
 <tr>
   <td>allowNonRestoredState</td>
   <td>Jar-Files to send to all workers and put on the classpath. The default value is all files from the classpath.</td>
 </tr>
+<tr>
+  <td>finishBundleBeforeCheckpointing</td>
+  <td>If set, finishes the current bundle and flushes all output before checkpointing the state of the operators. By default, starts checkpointing immediately and buffers any remaining bundle output as part of the checkpoint. The setting may affect the checkpoint alignment.</td>
+  <td>Default: false</td>
+</tr>
 <tr>
   <td>flinkMaster</td>
   <td>Address of the Flink Master where the Pipeline should be executed. Can either be of the form "host:port" or one of the special values [local], [collection] or [auto].</td>
 </tr>
 diff --git a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html index 4440ec0478e26..4170e25c543ce 100644 --- a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html +++ b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html @@ -1,20 +1,21 @@ -{{/* - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. See accompanying LICENSE file. -*/}} -{{/* -This is an auto-generated file. -Use generatePipelineOptionsTableJava and generatePipelineOptionsTablePython respectively -which should be called before running the tests. -*/}} + +
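Both tables document the new finishBundleBeforeCheckpointing (Java) / finish_bundle_before_checkpointing (Python) option added by this change. A minimal, hypothetical sketch of enabling it from Java, assuming the Flink runner artifact is on the classpath and that Beam generates the usual accessor for an option of this name (getFinishBundleBeforeCheckpointing):

    import org.apache.beam.runners.flink.FlinkPipelineOptions;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;

    public class FinishBundleOptionExample {
      public static void main(String[] args) {
        // The option defaults to false, as the table rows state.
        FlinkPipelineOptions options =
            PipelineOptionsFactory.fromArgs(
                    "--runner=FlinkRunner",
                    "--flinkMaster=[local]",
                    "--finishBundleBeforeCheckpointing=true")
                .as(FlinkPipelineOptions.class);
        System.out.println(options.getFinishBundleBeforeCheckpointing()); // true
      }
    }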
@@ -76,6 +77,11 @@
 <tr>
   <td>allow_non_restored_state</td>
   <td>Jar-Files to send to all workers and put on the classpath. The default value is all files from the classpath.</td>
 </tr>
+<tr>
+  <td>finish_bundle_before_checkpointing</td>
+  <td>If set, finishes the current bundle and flushes all output before checkpointing the state of the operators. By default, starts checkpointing immediately and buffers any remaining bundle output as part of the checkpoint. The setting may affect the checkpoint alignment.</td>
+  <td>Default: false</td>
+</tr>
 <tr>
   <td>flink_master</td>
   <td>Address of the Flink Master where the Pipeline should be executed. Can either be of the form "host:port" or one of the special values [local], [collection] or [auto].</td>
 </tr>
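The thread running through the SQL changes above is a single representation detail: Beam's SqlTypes.DATE logical type and ZetaSQL's DATE value both encode a date as a count of days since 1970-01-01, so conversion in each direction is a lossless integer cast. A short self-contained sketch of that round trip (the class name is illustrative; the conversions mirror beamLogicalObjectToZetaSqlValue and zetaSqlValueToBeamLogicalObject in ZetaSqlUtils):

    import com.google.zetasql.Value;
    import java.time.LocalDate;
    import org.apache.beam.sdk.schemas.Schema;
    import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes;
    import org.apache.beam.sdk.values.Row;

    public class DateRoundTripSketch {
      public static void main(String[] args) {
        // DATE fields are declared as a logical type: input type LocalDate,
        // base type INT64 (epoch days).
        Schema schema =
            Schema.builder().addLogicalTypeField("f_date", SqlTypes.DATE).build();
        Row row =
            Row.withSchema(schema).addValues(LocalDate.of(2008, 12, 25)).build();

        // Reading the field back yields a LocalDate, as the updated tests assert.
        LocalDate date = row.getLogicalTypeValue("f_date", LocalDate.class);

        // Beam => ZetaSQL: a DATE Value is created from the epoch-day count.
        Value zetaSqlValue = Value.createDateValue((int) date.toEpochDay());

        // ZetaSQL => Beam: the epoch-day count converts straight back.
        LocalDate roundTripped = LocalDate.ofEpochDay(zetaSqlValue.getDateValue());

        // Prints "14238 -> 2008-12-25", matching the UNIX_DATE expectation in
        // testDateToUnixInt64 above.
        System.out.println(zetaSqlValue.getDateValue() + " -> " + roundTripped);
      }
    }

Since ZetaSQL's DATE range (years 1 to 9999) fits comfortably in an int, the (int) cast cannot overflow for any valid date.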