Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

better filestack performance #614

Merged
merged 25 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
cdm sourcetype
  • Loading branch information
rafaqz committed Mar 23, 2024
commit 6fc3f8b8f7be03be008c5f2ee3fc670e29950dbc
1 change: 1 addition & 0 deletions ext/RastersArchGDALExt/gdal_source.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ end

# A GDAL raster dataset is converted to an `Array` before it is returned.
function RA.cleanreturn(A::AG.RasterDataset)
    return Array(A)
end
# `haslayers` is always `false` for the GDAL source.
function RA.haslayers(::GDALsource)
    return false
end
# A GDAL raster dataset always maps to the `GDALsource` source type.
function RA._sourcetype(A::AG.RasterDataset)
    return GDALsource()
end

"""
Base.write(filename::AbstractString, ::GDALsource, A::AbstractRaster; force=false, kw...)
Expand Down
1 change: 1 addition & 0 deletions ext/RastersGRIBDatasetsExt/gribdatasets_source.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ end
# Hack: the inner DiskArrays chunks are not exposed at the top level of a
# `GDS.Variable`, so chunk queries are forwarded to its `values` field.
function RA._get_eachchunk(var::GDS.Variable)
    return DiskArrays.eachchunk(var.values)
end
function RA._get_haschunks(var::GDS.Variable)
    return DiskArrays.haschunks(var.values)
end
# Any `GDS.Variable` reports the `GRIBsource` source type.
function RA._sourcetype(::GDS.Variable)
    return GRIBsource()
end
3 changes: 3 additions & 0 deletions ext/RastersNCDatasetsExt/ncdatasets_source.jl
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ end
# Chunk queries on an `NCD.Variable` are passed straight to DiskArrays.
function RA._get_eachchunk(var::NCD.Variable)
    return DiskArrays.eachchunk(var)
end
function RA._get_haschunks(var::NCD.Variable)
    return DiskArrays.haschunks(var)
end

# Both `NCD.Dataset` and `NCD.Variable` report the `NCDsource` source type.
function RA._sourcetype(::NCD.Dataset)
    return NCDsource()
end
function RA._sourcetype(::NCD.Variable)
    return NCDsource()
end

# precompilation

# const _NCDVar = NCDatasets.CFVariable{Union{Missing, Float32}, 3, NCDatasets.Variable{Float32, 3, NCDatasets.NCDataset}, NCDatasets.Attributes{NCDatasets.NCDataset{Nothing}}, NamedTuple{(:fillvalue, :scale_factor, :add_offset, :calendar, :time_origin, :time_factor), Tuple{Float32, Nothing, Nothing, Nothing, Nothing, Nothing}}}
Expand Down
55 changes: 30 additions & 25 deletions src/sources/commondatamodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,14 @@ end
# Chunk queries on a `CFDiskArray` are answered by the `_get_*` helpers below.
DiskArrays.eachchunk(var::CFDiskArray) = _get_eachchunk(var)
DiskArrays.haschunks(var::CFDiskArray) = _get_haschunks(var)

# NOTE(review): each `CFDiskArray` method below appears twice, once reading `var.var`
# and once calling `parent(var)`; this looks like the removed and added lines of a
# diff shown back to back - confirm which one the real file keeps.
# `CDM.CFVariable` wrappers are unwrapped through their `var` field.
_get_eachchunk(var::CFDiskArray) = _get_eachchunk(var.var)
_get_eachchunk(var::CFDiskArray) = _get_eachchunk(parent(var))
_get_eachchunk(var::CDM.CFVariable) = _get_eachchunk(var.var)
_get_haschunks(var::CFDiskArray) = _get_haschunks(var.var)
_get_haschunks(var::CFDiskArray) = _get_haschunks(parent(var))
_get_haschunks(var::CDM.CFVariable) = _get_haschunks(var.var)

# The source type of a wrapper is the source type of the object it wraps.
_sourcetype(var::CFDiskArray) = _sourcetype(parent(var))
_sourcetype(var::CDM.CFVariable) = _sourcetype(var.var)

# CommonDataModel.jl methods
for method in (:size, :name, :dimnames, :dataset, :attribnames)
@eval begin
Expand All @@ -90,21 +93,21 @@ end
# Rasters methods for CDM types ###############################

# This is usually called inside a closure and cleaned up in `cleanreturn`
# With no `key` given, build a `Raster` from the first variable of `ds` that has at
# least one dimension; throws an `ArgumentError` when no such variable exists.
# NOTE(review): the signature line and the `for` line each appear twice; this looks
# like the old and new side of a whitespace-only diff change - confirm.
function Raster(ds::AbstractDataset, filename::AbstractString, key::Nothing=nothing;
function Raster(ds::AbstractDataset, filename::AbstractString, key::Nothing=nothing;
source=nothing, kw...
)
# The source type is resolved from the `source` keyword when it is given,
# otherwise from `filename`.
source = isnothing(source) ? _sourcetype(filename) : _sourcetype(source)
# Find the first valid variable
layers = _layers(ds)
for (key, var) in zip(layers.keys, layers.vars)
for (key, var) in zip(layers.keys, layers.vars)
# Zero-dimensional variables are skipped.
if ndims(var) > 0
@info "No `name` or `key` keyword provided, using first valid layer with name `:$key`"
return Raster(CFDiskArray(var), filename, key; source, kw...)
end
end
throw(ArgumentError("dataset at $filename has no array variables"))
end
function Raster(ds::AbstractDataset, filename::AbstractString, key::Union{AbstractString,Symbol};
function Raster(ds::AbstractDataset, filename::AbstractString, key::Union{AbstractString,Symbol};
source=nothing, kw...
)
return Raster(CFDiskArray(ds[key]), filename, key; source)
Expand All @@ -119,7 +122,7 @@ function FileArray{source}(var::AbstractVariable, filename::AbstractString; kw..
end

function FileStack{source}(
ds::AbstractDataset, filename::AbstractString;
ds::AbstractDataset, filename::AbstractString;
write::Bool=false, keys::NTuple{N,Symbol}, vars
) where {source<:CDMsource,N}
layertypes = map(var -> Union{Missing,eltype(var)}, vars)
Expand All @@ -144,12 +147,12 @@ _open(f, ::CDMsource, var::CFDiskArray; kw...) = cleanreturn(f(var))

# TODO fix/test this for RasterStack
function create(filename, source::CDMsource, T::Union{Type,Tuple}, dims::DimTuple;
name=:layer1,
keys=(name,),
layerdims=map(_ -> dims, keys),
name=:layer1,
keys=(name,),
layerdims=map(_ -> dims, keys),
missingval=nothing,
metadata=NoMetadata(),
lazy=true,
metadata=NoMetadata(),
lazy=true,
)
types = T isa Tuple ? T : Ref(T)
missingval = T isa Tuple ? missingval : Ref(missingval)
Expand Down Expand Up @@ -215,16 +218,18 @@ end
# Collect, as a tuple, the result of `_cdmdim` for each dimension name of `var`,
# looked up in `CDM.dataset(var)`.
# NOTE(review): the repeated `end |> Tuple` line looks like the old and new side of
# a whitespace-only diff change - confirm.
function _layerdims(var::AbstractVariable)
map(CDM.dimnames(var)) do dimname
_cdmdim(CDM.dataset(var), dimname)
end |> Tuple
end |> Tuple
end
# Metadata for a variable, built by `_metadatadict` from `attr`, which defaults to
# the variable's own attributes.
# NOTE(review): the first definition passes `CDMsource()` and the second passes
# `_sourcetype(var)`; these look like the old and new side of the diff - confirm.
_metadata(var::AbstractVariable; attr=CDM.attribs(var)) = _metadatadict(CDMsource(), attr)
_metadata(var::AbstractVariable; attr=CDM.attribs(var)) =
_metadatadict(_sourcetype(var), attr)

# Build a tuple holding the result of `_cdmdim` for every dimension name of `ds`,
# passing `crs` and `mappedcrs` through to each call.
function _dims(ds::AbstractDataset, crs=nothing, mappedcrs=nothing)
    dimkeys = CDM.dimnames(ds)
    built = map(dimkey -> _cdmdim(ds, dimkey, crs, mappedcrs), dimkeys)
    return Tuple(built)
end
# Metadata for a dataset, built by `_metadatadict` from `attr`, which defaults to
# the dataset's own attributes.
# NOTE(review): the first definition passes `CDMsource()` and the second passes
# `_sourcetype(ds)`; these look like the old and new side of the diff - confirm.
_metadata(ds::AbstractDataset; attr=CDM.attribs(ds)) = _metadatadict(CDMsource(), attr)
_metadata(ds::AbstractDataset; attr=CDM.attribs(ds)) =
_metadatadict(_sourcetype(ds), attr)
function _layerdims(ds::AbstractDataset; layers)
dimdict = map(CDM.dimnames(ds)) do dimname
dimname => _cdmdimtype(ds, dimname)
Expand All @@ -237,7 +242,7 @@ function _layerdims(ds::AbstractDataset; layers)
end
function _layermetadata(ds::AbstractDataset; layers)
map(layers.attrs) do attr
md = _metadatadict(CDMsource(), attr)
md = _metadatadict(_sourcetype(ds), attr)
if haskey(attr, "grid_mapping")
md["grid_mapping"] = Dict(attr["grid_mapping"])
end
Expand Down Expand Up @@ -283,20 +288,20 @@ end
# Find the matching dimension constructor. If its an unknown name
# use the generic Dim with the dim name as type parameter
#
# Lookup order: the "axis" attribute via `CDM_AXIS_MAP`, then the "standard_name"
# attribute via `CDM_STANDARD_NAME_MAP`, then `dimname` itself via `CDM_DIM_MAP`.
# NOTE(review): several lines below appear twice; this looks like the old and new
# side of a whitespace-only diff change - confirm.
function _cdmdimtype(attrib, dimname)
if haskey(attrib, "axis")
k = attrib["axis"]
if haskey(CDM_AXIS_MAP, k)
return CDM_AXIS_MAP[k]
if haskey(attrib, "axis")
k = attrib["axis"]
if haskey(CDM_AXIS_MAP, k)
return CDM_AXIS_MAP[k]
end
end
if haskey(attrib, "standard_name")
k = attrib["standard_name"]
if haskey(CDM_STANDARD_NAME_MAP, k)
if haskey(CDM_STANDARD_NAME_MAP, k)
return CDM_STANDARD_NAME_MAP[k]
end
end
if haskey(CDM_DIM_MAP, dimname)
return CDM_DIM_MAP[dimname]
if haskey(CDM_DIM_MAP, dimname)
return CDM_DIM_MAP[dimname]
end
# Fallback: derive the dimension type from the name as a `Symbol`.
return DD.basetypeof(DD.key2dim(Symbol(dimname)))
end
Expand All @@ -307,7 +312,7 @@ function _cdmlookup(ds::AbstractDataset, dimname, D::Type, crs, mappedcrs)
var = ds[dimname]
index = var[:]
attr = CDM.attribs(var)
metadata = _metadatadict(CDMsource(), attr)
metadata = _metadatadict(_sourcetype(ds), attr)
return _cdmlookup(ds, var, attr, dimname, D, index, metadata, crs, mappedcrs)
end
# For unknown types we just make a Categorical lookup
Expand All @@ -318,7 +323,7 @@ end
# We need to include `Missing` in unions in case `_FillValue` is used
# on coordinate variables in a file and propagates here.
function _cdmlookup(
ds::AbstractDataset, var, attr, dimname,
ds::AbstractDataset, var, attr, dimname,
D::Type, index::AbstractArray{<:Union{Missing,Number,Dates.AbstractTime}},
metadata, crs, mappedcrs
)
Expand Down Expand Up @@ -432,7 +437,7 @@ _attribdict(md) = Dict{String,Any}()
# We need to get better at guaranteeing if X/Y is actually measured in `longitude/latitude`
# CF standards requires that we specify "units" if we use these standard names
#
# Each method writes the "axis" attribute for its dimension type into `atr`; the `Z`
# and `Ti` methods also write "standard_name". Any other dimension writes nothing.
# NOTE(review): the `Y` method appears twice; this looks like the old and new side
# of a whitespace-only diff change - confirm.
_cdm_set_axis_attrib!(atr, dim::X) = atr["axis"] = "X" # at["standard_name"] = "longitude";
_cdm_set_axis_attrib!(atr, dim::Y) = atr["axis"] = "Y" # at["standard_name"] = "latitude";
_cdm_set_axis_attrib!(atr, dim::Y) = atr["axis"] = "Y" # at["standard_name"] = "latitude";
_cdm_set_axis_attrib!(atr, dim::Z) = (atr["axis"] = "Z"; atr["standard_name"] = "depth")
_cdm_set_axis_attrib!(atr, dim::Ti) = (atr["axis"] = "T"; atr["standard_name"] = "time")
_cdm_set_axis_attrib!(atr, dim) = nothing
Expand Down
7 changes: 0 additions & 7 deletions test/sources/ncdatasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,6 @@ stackkeys = (
end

@testset "Raster" begin
using Cthulhu
using ProfileView, SnoopCompile
tinf = @snoopi_deep Raster(ncsingle; name=:tos)
fg = flamegraph(tinf)
ProfileView.view(fg)

@profview Raster(ncsingle; name=:tos)
@time ncarray = Raster(ncsingle);

@time lazyarray = Raster(ncsingle; lazy=true);
Expand Down
Loading