Skip to content

Commit

Permalink
[FLINK-3059] Improve JavaDocs for DataSet.writeAsText()
Browse files Browse the repository at this point in the history
Currently the JavaDocs of writeAsText() state it simply generates a file,
but this is not always true and it depends on the environment configuration.
This commit improves the JavaDocs of writeAsText().

This closes apache#1392
  • Loading branch information
jaoki authored and fhueske committed Nov 25, 2015
1 parent c787a03 commit e03e60d
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 11 deletions.
66 changes: 56 additions & 10 deletions flink-java/src/main/java/org/apache/flink/api/java/DataSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -1343,10 +1343,49 @@ public SortPartitionOperator<T> sortPartition(String field, Order order) {
// --------------------------------------------------------------------------------------------

/**
* Writes a DataSet as a text file to the specified location.<br>
* For each element of the DataSet the result of {@link Object#toString()} is written.
* Writes a DataSet as text file(s) to the specified location.<br>
* For each element of the DataSet the result of {@link Object#toString()} is written.<br/>
* <br/>
* <span class="strong">Output files and directories</span><br/>
 * The output produced by the writeAsText() method depends on the execution configuration:
* <ul>
* <li>
* A directory is created and multiple files are written underneath. (Default behavior)<br/>
 * This sink creates a directory called "path1", and files "1", "2" ... are written underneath depending on <a href="https://flink.apache.org/faq.html#what-is-the-parallelism-how-do-i-set-it">parallelism</a>
* <pre>{@code .
* └── path1/
* ├── 1
* ├── 2
* └── ...}</pre>
* Code Example
 * <pre>{@code dataset.writeAsText("file:///path1");}</pre>
* </li>
* <li>
* A single file called "path1" is created when parallelism is set to 1
* <pre>{@code .
* └── path1 }</pre>
* Code Example
 * <pre>{@code // Parallelism is set to only this particular operation
 *dataset.writeAsText("file:///path1").setParallelism(1);
 *
 * // This will create the same effect, but note that all operators' parallelism is set to one
 *env.setParallelism(1);
 *...
 *dataset.writeAsText("file:///path1"); }</pre>
* </li>
* <li>
* A directory is always created when <a href="https://ci.apache.org/projects/flink/flink-docs-master/setup/config.html#file-systems">fs.output.always-create-directory</a>
* is set to true in flink-conf.yaml file, even when parallelism is set to 1.
* <pre>{@code .
* └── path1/
* └── 1 }</pre>
* Code Example
* <pre>{@code // fs.output.always-create-directory = true
 *dataset.writeAsText("file:///path1").setParallelism(1); }</pre>
* </li>
* </ul>
*
* @param filePath The path pointing to the location the text file is written to.
 * @param filePath The path pointing to the location where the text file or the directory of files is written.
* @return The DataSink that writes the DataSet.
*
* @see TextOutputFormat
Expand All @@ -1356,14 +1395,15 @@ public DataSink<T> writeAsText(String filePath) {
}

/**
* Writes a DataSet as a text file to the specified location.<br>
* Writes a DataSet as text file(s) to the specified location.<br>
* For each element of the DataSet the result of {@link Object#toString()} is written.
*
* @param filePath The path pointing to the location the text file is written to.
* @param writeMode Control the behavior for existing files. Options are NO_OVERWRITE and OVERWRITE.
* @return The DataSink that writes the DataSet.
*
* @see TextOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
public DataSink<T> writeAsText(String filePath, WriteMode writeMode) {
TextOutputFormat<T> tof = new TextOutputFormat<T>(new Path(filePath));
Expand All @@ -1372,21 +1412,22 @@ public DataSink<T> writeAsText(String filePath, WriteMode writeMode) {
}

/**
 * Writes a DataSet as text file(s) to the specified location.<br>
 * Each element of the DataSet is rendered to its textual form via
 * {@link TextFormatter#format(Object)} before being written out.
 *
 * @param filePath The path pointing to the location the text file is written to.
 * @param formatter formatter that is applied on every element of the DataSet.
 * @return The DataSink that writes the DataSet.
 *
 * @see TextOutputFormat
 * @see DataSet#writeAsText(String) Output files and directories
 */
public DataSink<String> writeAsFormattedText(String filePath, TextFormatter<T> formatter) {
	// Render every element to a String first, then reuse the plain text sink.
	FormattingMapper<T> elementFormatter = new FormattingMapper<T>(clean(formatter));
	return map(elementFormatter).writeAsText(filePath);
}

/**
* Writes a DataSet as a text file to the specified location.<br>
* Writes a DataSet as text file(s) to the specified location.<br>
* For each element of the DataSet the result of {@link TextFormatter#format(Object)} is written.
*
* @param filePath The path pointing to the location the text file is written to.
Expand All @@ -1395,13 +1436,14 @@ public DataSink<String> writeAsFormattedText(String filePath, TextFormatter<T> f
* @return The DataSink that writes the DataSet.
*
* @see TextOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
// Renders each element to a String with the user-supplied formatter, then delegates
// to the plain text sink, honoring the requested write mode for existing files.
public DataSink<String> writeAsFormattedText(String filePath, WriteMode writeMode, TextFormatter<T> formatter) {
	FormattingMapper<T> elementFormatter = new FormattingMapper<T>(clean(formatter));
	return map(elementFormatter).writeAsText(filePath, writeMode);
}

/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location.<br>
* Writes a {@link Tuple} DataSet as CSV file(s) to the specified location.<br>
 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br>
* For each Tuple field the result of {@link Object#toString()} is written.
* Tuple fields are separated by the default field delimiter {@code "comma" (,)}.<br>
Expand All @@ -1412,13 +1454,14 @@ public DataSink<String> writeAsFormattedText(String filePath, WriteMode writeMod
*
* @see Tuple
* @see CsvOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
// Delegates to the delimiter-aware variant using the standard CSV defaults
// (newline-separated rows, comma-separated fields).
public DataSink<T> writeAsCsv(String filePath) {
	final String rowDelimiter = CsvOutputFormat.DEFAULT_LINE_DELIMITER;
	final String fieldDelimiter = CsvOutputFormat.DEFAULT_FIELD_DELIMITER;
	return writeAsCsv(filePath, rowDelimiter, fieldDelimiter);
}

/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location.<br>
* Writes a {@link Tuple} DataSet as CSV file(s) to the specified location.<br>
 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br>
* For each Tuple field the result of {@link Object#toString()} is written.
* Tuple fields are separated by the default field delimiter {@code "comma" (,)}.<br>
Expand All @@ -1430,13 +1473,14 @@ public DataSink<T> writeAsCsv(String filePath) {
*
* @see Tuple
* @see CsvOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
// Writes CSV with the default row/field delimiters; only the behavior for
// already-existing output (NO_OVERWRITE vs. OVERWRITE) is customized here.
public DataSink<T> writeAsCsv(String filePath, WriteMode writeMode) {
	final Path outputPath = new Path(filePath);
	return internalWriteAsCsv(outputPath, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER, writeMode);
}

/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.<br>
* Writes a {@link Tuple} DataSet as CSV file(s) to the specified location with the specified field and line delimiters.<br>
 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br>
* For each Tuple field the result of {@link Object#toString()} is written.
*
Expand All @@ -1446,13 +1490,14 @@ public DataSink<T> writeAsCsv(String filePath, WriteMode writeMode) {
*
* @see Tuple
* @see CsvOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
// Writes CSV with caller-supplied delimiters. Passing a null write mode leaves
// the existing-file behavior at the system default.
public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter) {
	final Path outputPath = new Path(filePath);
	return internalWriteAsCsv(outputPath, rowDelimiter, fieldDelimiter, /* writeMode */ null);
}

/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.<br>
* Writes a {@link Tuple} DataSet as CSV file(s) to the specified location with the specified field and line delimiters.<br>
 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br>
 * For each Tuple field the result of {@link Object#toString()} is written.
*
Expand All @@ -1463,6 +1508,7 @@ public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String field
*
* @see Tuple
* @see CsvOutputFormat
* @see DataSet#writeAsText(String) Output files and directories
*/
public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter, WriteMode writeMode) {
return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, writeMode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,7 @@ class DataSet[T: ClassTag](set: JavaDataSet[T]) {
/**
* Writes `this` DataSet to the specified location. This uses [[AnyRef.toString]] on
* each element.
* @see org.apache.flink.api.java.DataSet#writeAsText(String)
*/
def writeAsText(
filePath: String,
Expand All @@ -1473,9 +1474,10 @@ class DataSet[T: ClassTag](set: JavaDataSet[T]) {
}

/**
* Writes `this` DataSet to the specified location as a CSV file.
* Writes `this` DataSet to the specified location as CSV file(s).
*
* This only works on Tuple DataSets. For individual tuple fields [[AnyRef.toString]] is used.
* @see org.apache.flink.api.java.DataSet#writeAsText(String)
*/
def writeAsCsv(
filePath: String,
Expand Down

0 comments on commit e03e60d

Please sign in to comment.