Skip to content

Commit

Permalink
Add create dwca tests
Browse files Browse the repository at this point in the history
  • Loading branch information
patkyn committed Jun 24, 2024
1 parent 0f50864 commit efba241
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 11 deletions.
12 changes: 8 additions & 4 deletions src/dwcahandler/dwca/base_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,22 +90,26 @@ def delete_records(self, records_to_delete: CsvFileType):
def validate_content(self, content_type_to_validate: list[str] = None, error_file: str = None):
pass

def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str = './dwca/output/'):
def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str):
self.extract_dwca(exclude_ext_files=exclude_ext_files)
self.generate_eml()
self.generate_meta()
self.write_dwca(output_dwca_path)

def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: str = './dwca/output/'):
def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: str):
self.extract_dwca()
self.delete_records(records_to_delete)
self.generate_eml()
self.generate_meta()
self.write_dwca(output_dwca_path)

def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], ext_csv_list: list[CsvFileType],
output_dwca_path: str = './dwca/output/', validate_content: bool = True,
def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], output_dwca_path: str,
ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
eml_content: Union[str, Eml] = ''):

if ext_csv_list is None:
ext_csv_list = []

self.extract_csv_content(core_csv, CoreOrExtType.CORE)

# Only validate core content
Expand Down
15 changes: 8 additions & 7 deletions src/dwcahandler/dwca/dwca_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ def list_dwc_terms() -> pd.DataFrame:

@staticmethod
def create_dwca(core_csv: Union[CsvFileType, DataFrameType],
ext_csv_list: list[Union [CsvFileType, DataFrameType]] = list,
output_dwca_path: str = './dwca/output/', validate_content: bool = True,
eml_content: Union [str, Eml] = ''):
output_dwca_path: str,
ext_csv_list: list[Union[CsvFileType, DataFrameType]] = None,
validate_content: bool = True,
eml_content: Union[str, Eml] = ''):
"""Create a suitable DwCA from a list of CSV files
:param core_csv: The core source
Expand All @@ -38,7 +39,7 @@ def create_dwca(core_csv: Union[CsvFileType, DataFrameType],
validate_content=validate_content, eml_content=eml_content)

@staticmethod
def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: str = './dwca/output/'):
def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: str):
"""Load a DwCA and remove extension files from it
:param dwca_file: The path to the DwCA
Expand All @@ -50,7 +51,7 @@ def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: st

@staticmethod
def delete_records(dwca_file: str, records_to_delete: CsvFileType,
output_dwca_path: str = './dwca/output/'):
output_dwca_path: str):
"""Delete core records listed in the records_to_delete file from DwCA.
The specified keys listed in records_to_delete param must exist in the dwca core file
Expand All @@ -62,7 +63,7 @@ def delete_records(dwca_file: str, records_to_delete: CsvFileType,
output_dwca_path=output_dwca_path)

@staticmethod
def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys_lookup: dict = {},
def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys_lookup: dict = None,
extension_sync: bool = False, regen_ids: bool = False, validate_delta_content: bool = True):
"""Merge a DwCA with a delta DwCA of changes.
Expand All @@ -80,7 +81,7 @@ def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys
regen_ids=regen_ids, validate_delta=validate_delta_content)

@staticmethod
def validate_dwca(dwca_file: str, keys_lookup: dict = {}, error_file: str = None):
def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
"""Test a dwca for consistency
:param dwca_file: The path to the DwCA
Expand Down
4 changes: 4 additions & 0 deletions tests/input_files/sample/multimedia.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
occurrenceID,format,creator,license,type,identifier,documentId,rights
014826,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=d68f8c06,,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
014825,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=a5923083,,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
014824,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=7fab23e4,,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
13 changes: 13 additions & 0 deletions tests/input_files/sample/occurrence.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
occurrenceID,basisOfRecord,scientificName,license,decimalLatitude,decimalLongitude
014826,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-30.0000,144.0000
014825,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-31.1111,145.0000
014824,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-32.085431,100.828059
014823,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-33.097233,101.820888
014822,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-34.099936,102.821654
014821,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-35.893671,104.999974
014802,Human Observation,Alectryon coriaceus,CC-BY 4.0 (Int),-34.113747,120.889354
014801,Human Observation,Eucalyptus robusta,CC-BY 4.0 (Int),-36.0000,144.308848
014800,Human Observation,Arundo donax,CC-BY 4.0 (Int),-30.440251,146.240159
014799,Human Observation,Arundo donax,CC-BY 4.0 (Int),-31.547195,150.783246
014798,Human Observation,Arundo donax,CC-BY 4.0 (Int),-40.481117,150.823468
014792,Human Observation,Euphorbia paralias,CC-BY 4.0 (Int),-28.0000,115.0000
129 changes: 129 additions & 0 deletions tests/test_writedwca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from dwcahandler import DwcaHandler, CsvFileType, CoreOrExtType, Eml
from zipfile import ZipFile
from pathlib import Path
import xml.etree.ElementTree as ET
import re
import pandas as pd


def _get_namespace(element):
    """Extract the namespace prefix from a tag written as `{namespace}tag`

    :param element: An object exposing a string `tag` attribute
    :return: The leading `{namespace}` part of the tag, braces included, or ''
             when the tag does not start with a braced namespace
    """
    braced_prefix = re.match("\\{.*\\}", element.tag)
    if braced_prefix is None:
        return ''
    return braced_prefix.group(0)


def _get_eml_content():
    """Build the Eml metadata object passed to create_dwca by the tests

    :return: An Eml populated with fixed sample values
    """
    sample_metadata = {
        'dataset_name': 'Sample Dataset',
        'description': 'A dataset sample',
        'license': 'sample license',
        'citation': 'sample citation',
        'rights': 'sample rights',
    }
    return Eml(**sample_metadata)


# Resolve the sample files against this test module instead of the current
# working directory, so the module can be imported from any directory.
_sample_dir = Path(__file__).resolve().parent / "input_files" / "sample"
occurrence_sample_file = str(_sample_dir / "occurrence.csv")
multimedia_sample_file = str(_sample_dir / "multimedia.csv")
# Reference frames that the data files of the generated archives are compared against.
sample_occ_df = pd.read_csv(occurrence_sample_file)
sample_multimedia_df = pd.read_csv(multimedia_sample_file)


class TestPublish:
    """
    Tests that DwcaHandler.create_dwca writes an archive whose meta.xml and data files
    match the sample csv files
    """

    @staticmethod
    def _read_meta_root(zf):
        """
        Parse meta.xml from an open archive and check its namespace
        :param zf: The ZipFile opened on the generated archive
        :return: Tuple of the meta.xml root element and its namespace in `{namespace}` form
        """
        with zf.open('meta.xml') as meta_xml_file:
            root = ET.parse(meta_xml_file).getroot()
        ns = _get_namespace(root)
        assert ns == "{http://rs.tdwg.org/dwc/text/}"
        return root, ns

    @staticmethod
    def _check_meta_node(root, ns, node_type, expected_df):
        """
        Assert that a meta.xml node declares one field per column of the expected data
        :param root: Root element of the parsed meta.xml
        :param ns: Namespace of the root in `{namespace}` form
        :param node_type: The CoreOrExtType used to build the tag of the node to look up
        :param expected_df: Data frame whose columns must all appear among the field terms
        :return: Text of the files/location element found under the node
        """
        node = root.find(f'{ns}{node_type}')
        # Compare with None: an Element without children is falsy, and truth-testing
        # an Element is deprecated.
        assert node is not None
        term_fields = [f.attrib.get('term') for f in node.findall(f'{ns}field')]
        assert len(term_fields) == len(expected_df.columns)
        for col in expected_df.columns:
            assert any(col in term for term in term_fields)
        return node.find(f'{ns}files').find(f'{ns}location').text

    def test_generate_dwca_without_ext(self):
        """
        Test that generated dwca is valid with core occ data
        """
        core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'],
                               type=CoreOrExtType.CORE)
        p = Path("temp")
        p.mkdir(parents=True, exist_ok=True)
        dwca_output_path = str(Path(p / "dwca.zip").absolute())
        DwcaHandler.create_dwca(core_csv=core_csv, output_dwca_path=dwca_output_path,
                                eml_content=_get_eml_content())
        # The context manager closes the archive, no explicit close() is needed
        with ZipFile(dwca_output_path, 'r') as zf:
            files = zf.namelist()
            assert 'meta.xml' in files
            assert 'eml.xml' in files

            root, ns = self._read_meta_root(zf)
            core_file = self._check_meta_node(root, ns, CoreOrExtType.CORE, sample_occ_df)
            assert core_file

            with zf.open(core_file) as occ_file:
                df = pd.read_csv(occ_file)
            assert 'id' in df.columns
            pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df)

    def test_generate_dwca_with_ext(self):
        """
        Test that generated dwca is valid with core occ and multimedia data
        """
        core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'],
                               type=CoreOrExtType.CORE)
        ext_csv = CsvFileType(files=[multimedia_sample_file], keys=['occurrenceID'],
                              type=CoreOrExtType.EXTENSION)
        p = Path("temp")
        p.mkdir(parents=True, exist_ok=True)
        dwca_output_path = str(Path(p / "dwca_with_ext.zip").absolute())
        DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext_csv], output_dwca_path=dwca_output_path,
                                eml_content=_get_eml_content())
        # The context manager closes the archive, no explicit close() is needed
        with ZipFile(dwca_output_path, 'r') as zf:
            files = zf.namelist()
            assert 'meta.xml' in files
            assert 'eml.xml' in files

            root, ns = self._read_meta_root(zf)
            core_file = self._check_meta_node(root, ns, CoreOrExtType.CORE, sample_occ_df)
            ext_file = self._check_meta_node(root, ns, CoreOrExtType.EXTENSION, sample_multimedia_df)
            assert core_file
            assert ext_file

            with zf.open(core_file) as occ_file:
                df = pd.read_csv(occ_file)
            assert 'id' in df.columns
            pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df)

            with zf.open(ext_file) as image_file:
                df = pd.read_csv(image_file)
            assert 'coreid' in df.columns
            pd.testing.assert_frame_equal(df.drop(columns=['coreid']), sample_multimedia_df)

0 comments on commit efba241

Please sign in to comment.