JuliaData · nalimilan · Oct 29, 2018 · Oct 8, 2018 · Oct 9, 2018 · Oct 9, 2018
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -30,7 +30,7 @@ Depth = 2
 ## API
 
 ```@contents
-Pages = ["lib/types.md", "lib/functions.md"]
+Pages = ["lib/types.md", "lib/functions.md", "lib/indexing.md"]
 Depth = 2
 ```
 

diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md
@@ -0,0 +1,81 @@
+
+```@meta
+CurrentModule = DataFrames
+```
+
+# Indexing
+
+```@index
+Pages = ["indexing.md"]
+```
+
+## General rules
+
+The following rules describe how `getindex`, `setindex!`, and `view` are intended to work with `DataFrame`, `SubDataFrame` and `DataFrameRow` objects.
+
+The rules for a valid type of index into a column are the following:
+* a value, later denoted as `col`:
+ * a `Symbol`;
+ * an `Integer` that is not `Bool`;
+* a vector, later denoted as `cols`:
+ * a vector of `Symbol` (does not have to be a subtype of `AbstractVector{Symbol}`);
+ * a vector of `Integer` other than `Bool` (does not have to be a subtype of `AbstractVector{<:Integer}`);
+ * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`;
+ * a colon.
+
+The rules for a valid type of index into a row are the following:
+* a value, later denoted as `row`:
+ * an `Integer` that is not `Bool`;
+* a vector, later denoted as `rows`:
+ * a vector of `Integer` other than `Bool` (does not have to be a subtype of `AbstractVector{<:Integer}`);
+ * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`;
+ * a colon.
+
+In the descriptions below `df` represents a `DataFrame`, `sdf` is a `SubDataFrame` and `dfr` is a `DataFrameRow`.
+
+## `getindex`
+
+The following list specifies return types of `getindex` operations depending on argument types.
+
+In all operations copying vectors is avoided where possible.
+If copying is performed, the description explicitly mentions that the data is *copied*.
+
+For performance reasons, accessing, via `getindex` or `view`, a single `row` and multiple `cols` of a `DataFrame`, a `SubDataFrame` or a `DataFrameRow` always returns a `DataFrameRow` (which is a view-like type).
+
+`DataFrame`:
+* `df[col]` -> the vector contained in column `col`;
+* `df[cols]` -> a freshly allocated `DataFrame` containing the vectors contained in columns `cols`;
+* `df[row, col]` -> the value contained in row `row` of column `col`, the same as `df[col][row]`;
+* `df[row, cols]` -> a `DataFrameRow` with parent `df` if `cols` is a colon and `df[cols]` otherwise;
+* `df[rows, col]` -> a copy of the vector `df[col]` with only the entries corresponding to `rows` selected, the same as `df[col][rows]`;
+* `df[rows, cols]` -> a `DataFrame` containing copies of columns `cols` with only the entries corresponding to `rows` selected;
+* `@view df[col]` -> the vector contained in column `col` (this is equivalent to `df[col]`);
+* `@view df[cols]` -> a `SubDataFrame` with parent `df` if `cols` is a colon and `df[cols]` otherwise;
+* `@view df[row, col]` -> a `0`-dimensional view into `df[col]`, the same as `view(df[col], row)`;
+* `@view df[row, cols]` -> a `DataFrameRow` with parent `df` if `cols` is a colon and `df[cols]` otherwise;
+* `@view df[rows, col]` -> a view into `df[col]` with `rows` selected, the same as `view(df[col], rows)`;
+* `@view df[rows, cols]` -> a `SubDataFrame` with `rows` selected with parent `df` if `cols` is a colon and `df[cols]` otherwise.
+
+`SubDataFrame`:
+* `sdf[col]` -> a view of the vector contained in column `col` of `parent(sdf)` with `DataFrames.rows(sdf)` as a selector;
+* `sdf[cols]` -> a `SubDataFrame`, with parent `parent(sdf)` if `cols` is a colon and `parent(sdf)[cols]` otherwise;
+* `sdf[row, col]` -> a value contained in row `row` of column `col`;
+* `sdf[row, cols]` -> a `DataFrameRow` with parent `parent(sdf)` if `cols` is a colon and `parent(sdf)[cols]` otherwise;
+* `sdf[rows, col]` -> a copy of a vector `sdf[col]` with only rows `rows` selected;
+* `sdf[rows, cols]` -> a `DataFrame` containing columns `cols` and `sdf[rows, col]` as a vector for each `col` in `cols`;
+* `@view sdf[col]` -> a view of the vector contained in column `col` of `parent(sdf)` with `DataFrames.rows(sdf)` as a selector;
+* `@view sdf[cols]` -> a `SubDataFrame` with parent `parent(sdf)` if `cols` is a colon and `parent(sdf)[cols]` otherwise;
+* `@view sdf[row, col]` -> translates to `view(sdf[col], row)` (a `0`-dimensional view into `sdf[col]`);
+* `@view sdf[row, cols]` -> a `DataFrameRow` with parent `parent(sdf)` if `cols` is a colon and `parent(sdf)[cols]` otherwise;
+* `@view sdf[rows, col]` -> translates to `view(sdf[col], rows)` (a standard view into `sdf[col]` vector);
+* `@view sdf[rows, cols]` -> a `SubDataFrame` with parent `parent(sdf)` if `cols` is a colon and `sdf[cols]` otherwise.
+
+`DataFrameRow`:
+* `dfr[col]` -> the value contained in column `col` of `dfr`;
+* `dfr[cols]` -> a `DataFrameRow` with parent `parent(dfr)` if `cols` is a colon and `parent(dfr)[cols]` otherwise;
+* `@view dfr[col]` -> a `0`-dimensional view into `parent(dfr)[DataFrames.row(dfr), col]`;
+* `@view dfr[cols]` -> a `DataFrameRow` with parent `parent(dfr)` if `cols` is a colon and `parent(dfr)[cols]` otherwise.
+
+## `setindex!`
+
+Under construction
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -134,7 +134,7 @@ names!(df, [:a, :b, :a], makeunique=true) # renames second :a to :a_1
 """
 function names!(df::AbstractDataFrame, vals; allow_duplicates=false, makeunique::Bool=false)
  if allow_duplicates
- Base.depwarn("Keyword argument allow_duplicates is deprecated. Use makeunique.", :names!)
+ Base.depwarn("Keyword argument allow_duplicates is deprecated. Use `makeunique` keyword argument.", :names!)
  end
  names!(index(df), vals, allow_duplicates=allow_duplicates, makeunique=makeunique)
  return df

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -240,26 +240,9 @@ ncol(df::DataFrame) = length(index(df))
 ##
 ##############################################################################
 
-# Cases:
-#
-# df[SingleColumnIndex] => AbstractDataVector
-# df[MultiColumnIndex] => DataFrame
-# df[SingleRowIndex, SingleColumnIndex] => Scalar
-# df[SingleRowIndex, MultiColumnIndex] => DataFrame
-# df[MultiRowIndex, SingleColumnIndex] => AbstractVector
-# df[MultiRowIndex, MultiColumnIndex] => DataFrame
-#
-# General Strategy:
-#
-# Let getindex(index(df), col_inds) from Index() handle the resolution
-# of column indices
-# Let getindex(columns(df)[j], row_inds) from AbstractVector() handle
-# the resolution of row indices
-
-# TODO: change Real to Integer in this union after deprecation period
-const ColumnIndex = Union{Real, Symbol}
-
-# df[SingleColumnIndex] => AbstractDataVector
+const ColumnIndex = Union{Integer, Symbol}
+
+# df[SingleColumnIndex] => AbstractVector, the same vector
 function Base.getindex(df::DataFrame, col_ind::ColumnIndex)
  selected_column = index(df)[col_ind]
  return columns(df)[selected_column]
@@ -276,24 +259,35 @@ end
 Base.getindex(df::DataFrame, col_inds::Colon) = copy(df)
 
 # df[SingleRowIndex, SingleColumnIndex] => Scalar
-function Base.getindex(df::DataFrame, row_ind::Real, col_ind::ColumnIndex)
+function Base.getindex(df::DataFrame, row_ind::Integer, col_ind::ColumnIndex)
  selected_column = index(df)[col_ind]
  return columns(df)[selected_column][row_ind]
 end
 
-# df[SingleRowIndex, MultiColumnIndex] => DataFrame
-function Base.getindex(df::DataFrame, row_ind::Bool, col_inds::AbstractVector)
- throw(ArgumentError("invalid row index: $row_ind of type Bool"))
-end
-
-# df[SingleRowIndex, MultiColumnIndex] => DataFrame
-function Base.getindex(df::DataFrame, row_ind::Real, col_inds::AbstractVector)
+# df[SingleRowIndex, MultiColumnIndex] => DataFrame (will be DataFrameRow)
+function Base.getindex(df::DataFrame, row_ind::Integer, col_inds::AbstractVector)
+ if row_ind isa Bool
+ throw(ArgumentError("invalid row index: $row_ind of type Bool"))
+ end
+ Base.depwarn("Selecting a single row from a `DataFrame` will return a `DataFrameRow` in the future. " *
+ "To get a `DataFrame` use `df[row_ind:row_ind, col_inds]`.", :getindex)
  selected_columns = index(df)[col_inds]
- new_columns = Any[dv[[row_ind]] for dv in columns(df)[selected_columns]]
+ new_columns = AbstractVector[[dv[row_ind]] for dv in columns(df)[selected_columns]]
  return DataFrame(new_columns, Index(_names(df)[selected_columns]))
 end
 
-# df[MultiRowIndex, SingleColumnIndex] => AbstractVector
+# df[SingleRowIndex, :] => DataFrame
+function Base.getindex(df::DataFrame, row_ind::Integer, ::Colon)
+ if row_ind isa Bool
+ throw(ArgumentError("invalid row index: $row_ind of type Bool"))
+ end
+ Base.depwarn("Selecting a single row from a `DataFrame` will return a `DataFrameRow` in the future. " *
+ "To get a `DataFrame` use `df[row_ind:row_ind, :]`.", :getindex)
+ new_columns = AbstractVector[[dv[row_ind]] for dv in columns(df)]
+ return DataFrame(new_columns, copy(index(df)))
+end
+
+# df[MultiRowIndex, SingleColumnIndex] => AbstractVector, copy
 function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_ind::ColumnIndex)
  selected_column = index(df)[col_ind]
  return columns(df)[selected_column][row_inds]
@@ -302,30 +296,28 @@ end
 # df[MultiRowIndex, MultiColumnIndex] => DataFrame
 function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::AbstractVector)
  selected_columns = index(df)[col_inds]
- new_columns = Any[dv[row_inds] for dv in columns(df)[selected_columns]]
+ new_columns = AbstractVector[dv[row_inds] for dv in columns(df)[selected_columns]]
  return DataFrame(new_columns, Index(_names(df)[selected_columns]))
 end
 
 # df[:, SingleColumnIndex] => AbstractVector
 # df[:, MultiColumnIndex] => DataFrame
 function Base.getindex(df::DataFrame, row_ind::Colon, col_inds)
- Base.depwarn("indexing with colon as row will create a copy in the future" *
- " use df[col_inds] to get the columns without copying", :getindex)
+ Base.depwarn("Indexing with colon as row will create a copy in the future. " *
+ "Use `df[col_inds]` to get the columns without copying", :getindex)
  df[col_inds]
 end
 
-# df[SingleRowIndex, :] => DataFrame
-Base.getindex(df::DataFrame, row_ind::Real, col_inds::Colon) = df[[row_ind], col_inds]
-
 # df[MultiRowIndex, :] => DataFrame
-function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::Colon)
- new_columns = Any[dv[row_inds] for dv in columns(df)]
+function Base.getindex(df::DataFrame, row_inds::AbstractVector, ::Colon)
+ new_columns = AbstractVector[dv[row_inds] for dv in columns(df)]
  return DataFrame(new_columns, copy(index(df)))
 end
 
 # df[:, :] => DataFrame
 function Base.getindex(df::DataFrame, ::Colon, ::Colon)
- Base.depwarn("indexing with colon as row will create a copy of rows in the future", :getindex)
+ Base.depwarn("Indexing with colon as row will create a copy of column vectors in the" *
+ " future. use `df[:]` to get the columns without copying", :getindex)
  copy(df)
 end
 

diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl
@@ -4,7 +4,6 @@ struct DataFrameRow{T <: AbstractDataFrame}
  row::Int
 end
 
-
 """
  parent(r::DataFrameRow)
 
@@ -13,14 +12,13 @@ Return the parent data frame of `r`.
 Base.parent(r::DataFrameRow) = getfield(r, :df)
 row(r::DataFrameRow) = getfield(r, :row)
 
-function Base.getindex(r::DataFrameRow, idx::AbstractArray)
- return DataFrameRow(parent(r)[idx], row(r))
-end
-
 function Base.getindex(r::DataFrameRow, idx::ColumnIndex)
  return parent(r)[row(r), idx]
 end
 
+Base.getindex(r::DataFrameRow, idxs::AbstractVector) =
+ DataFrameRow(parent(r)[idxs], row(r))
+
 Base.getindex(r::DataFrameRow, ::Colon) = r
 
 function Base.setindex!(r::DataFrameRow, value::Any, idx::Any)
@@ -37,7 +35,17 @@ Base.setproperty!(r::DataFrameRow, idx::Symbol, x::Any) = setindex!(r, x, idx)
 # Private fields are never exposed since they can conflict with column names
 Base.propertynames(r::DataFrameRow, private::Bool=false) = names(r)
 
-Base.view(r::DataFrameRow, c) = DataFrameRow(parent(r)[[c]], row(r))
+function Base.view(r::DataFrameRow, col::ColumnIndex)  # deprecated single-column form
+ if col isa Bool  # Bool is an Integer, hence a ColumnIndex; reject it explicitly
+ throw(ArgumentError("invalid column index: $col of type Bool"))
+ end
+ Base.depwarn("`view(dfr, col)` will return a `0`-dimensional view in the future." *
+ " Use `view(dfr, [col])` to get a `DataFrameRow`.", :view)  # funcsym names this method
+ DataFrameRow(parent(r)[[col]], row(r))  # still returns a one-column DataFrameRow
+end
+
+Base.view(r::DataFrameRow, cols) = DataFrameRow(parent(r)[cols], row(r))
+Base.view(r::DataFrameRow, ::Colon) = r
 
 index(r::DataFrameRow) = index(parent(r))
 

diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -19,7 +19,7 @@ import Base: keys, values, insert!
 @deprecate complete_cases! dropmissing!
 @deprecate complete_cases completecases
 
-@deprecate sub(df::AbstractDataFrame, rows) view(df, rows)
+@deprecate sub(df::AbstractDataFrame, rows) view(df, rows, :)
 
 ## write.table
 using CodecZlib, TranscodingStreams

diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -83,7 +83,7 @@ function groupby(df::AbstractDataFrame, cols::Vector;
  df_groups = group_rows(sdf, skipmissing)
  # sort the groups
  if sort
- group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts]))
+ group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts], :))
  permute!(df_groups.starts, group_perm)
  Base.permute!!(df_groups.stops, group_perm)
  end
@@ -98,7 +98,7 @@ function Base.iterate(gd::GroupedDataFrame, i=1)
  if i > length(gd.starts)
  nothing
  else
- (view(gd.parent, gd.idx[gd.starts[i]:gd.ends[i]]), i+1)
+ (view(gd.parent, gd.idx[gd.starts[i]:gd.ends[i]], :), i+1)
  end
 end
 
@@ -108,7 +108,7 @@ Base.first(gd::GroupedDataFrame) = gd[1]
 Base.last(gd::GroupedDataFrame) = gd[end]
 
 Base.getindex(gd::GroupedDataFrame, idx::Integer) =
- view(gd.parent, gd.idx[gd.starts[idx]:gd.ends[idx]])
+ view(gd.parent, gd.idx[gd.starts[idx]:gd.ends[idx]], :)
 Base.getindex(gd::GroupedDataFrame, idxs::AbstractArray) =
  GroupedDataFrame(gd.parent, gd.cols, gd.idx, gd.starts[idxs], gd.ends[idxs])
 Base.getindex(gd::GroupedDataFrame, idxs::Colon) =

diff --git a/src/other/index.jl b/src/other/index.jl
@@ -136,48 +136,38 @@ function Base.insert!(x::Index, idx::Integer, nm::Symbol)
 end
 
 Base.getindex(x::AbstractIndex, idx::Symbol) = x.lookup[idx]
-Base.getindex(x::AbstractIndex, idx::AbstractVector{Symbol}) = [x.lookup[i] for i in idx]
 Base.getindex(x::AbstractIndex, idx::Bool) = throw(ArgumentError("invalid index: $idx of type Bool"))
 Base.getindex(x::AbstractIndex, idx::Integer) = Int(idx)
+
+Base.getindex(x::AbstractIndex, idx::AbstractVector{Symbol}) = [x.lookup[i] for i in idx]
+
 Base.getindex(x::AbstractIndex, idx::AbstractVector{Int}) = idx
 Base.getindex(x::AbstractIndex, idx::AbstractRange{Int}) = idx
 Base.getindex(x::AbstractIndex, idx::AbstractRange{<:Integer}) = collect(Int, idx)
-
-function Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool})
- length(x) == length(idx) || throw(BoundsError(x, idx))
- findall(idx)
-end
-
-function Base.getindex(x::AbstractIndex, idx::AbstractVector{Union{Bool, Missing}})
- if any(ismissing, idx)
- throw(ArgumentError("missing values are not allowed for column indexing"))
- end
- getindex(x, collect(Missings.replace(idx, false)))
-end
-
 function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer})
  if any(v -> v isa Bool, idx)
  throw(ArgumentError("Bool values except for AbstractVector{Bool} are not allowed for column indexing"))
  end
  Vector{Int}(idx)
 end
 
+Base.getindex(x::AbstractIndex, idx::AbstractRange{Bool}) = getindex(x, collect(idx))
+function Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool})
+ length(x) == length(idx) || throw(BoundsError(x, idx))
+ findall(idx)
+end
+
 # catch all method handling cases when type of idx is not narrowest possible, Any in particular
-# also it handles passing missing values in idx
-function Base.getindex(x::AbstractIndex, idx::AbstractVector)
- idxs = filter(!ismissing, idx)
- if length(idxs) != length(idx)
- throw(ArgumentError("missing values are not allowed for column indexing"))
- end
+function Base.getindex(x::DataFrames.AbstractIndex, idxs::AbstractVector)
  length(idxs) == 0 && return Int[] # special case of empty idxs
  if idxs[1] isa Real
  if !all(v -> v isa Integer && !(v isa Bool), idxs)
  throw(ArgumentError("Only Integer values allowed when indexing by vector of numbers"))
  end
- return Vector{Int}(idxs)
+ return convert(Vector{Int}, idxs)
  end
- idxs[1] isa Symbol && return getindex(x, Vector{Symbol}(idxs))
- throw(ArgumentError("idx[1] has type $(typeof(idx[1])); "*
+ idxs[1] isa Symbol && return getindex(x, convert(Vector{Symbol}, idxs))
+ throw(ArgumentError("idxs[1] has type $(typeof(idxs[1])); "*
  "DataFrame only supports indexing columns with integers, symbols or boolean vectors"))
 end