Add create dwca tests

AtlasOfLivingAustralia · Jun 24, 2024 · efba241 · efba241
1 parent 0f50864
commit efba241
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 11 deletions.
diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py
@@ -90,22 +90,26 @@ def delete_records(self, records_to_delete: CsvFileType):
  def validate_content(self, content_type_to_validate: list[str] = None, error_file: str = None):
  pass
 
- def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str = './dwca/output/'):
+ def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str):
  self.extract_dwca(exclude_ext_files=exclude_ext_files)
  self.generate_eml()
  self.generate_meta()
  self.write_dwca(output_dwca_path)
 
- def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: str = './dwca/output/'):
+ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: str):
  self.extract_dwca()
  self.delete_records(records_to_delete)
  self.generate_eml()
  self.generate_meta()
  self.write_dwca(output_dwca_path)
 
- def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], ext_csv_list: list[CsvFileType],
- output_dwca_path: str = './dwca/output/', validate_content: bool = True,
+ def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], output_dwca_path: str,
+ ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
  eml_content: Union[str, Eml] = ''):
+
+ if ext_csv_list is None:
+ ext_csv_list = []
+
  self.extract_csv_content(core_csv, CoreOrExtType.CORE)
 
  # Only validate core content

diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py
@@ -23,9 +23,10 @@ def list_dwc_terms() -> pd.DataFrame:
 
  @staticmethod
  def create_dwca(core_csv: Union[CsvFileType, DataFrameType],
- ext_csv_list: list[Union [CsvFileType, DataFrameType]] = list,
- output_dwca_path: str = './dwca/output/', validate_content: bool = True,
- eml_content: Union [str, Eml] = ''):
+ output_dwca_path: str,
+ ext_csv_list: list[Union[CsvFileType, DataFrameType]] = None,
+ validate_content: bool = True,
+ eml_content: Union[str, Eml] = ''):
  """Create a suitable DwCA from a list of CSV files
 
  :param core_csv: The core source
@@ -38,7 +39,7 @@ def create_dwca(core_csv: Union[CsvFileType, DataFrameType],
  validate_content=validate_content, eml_content=eml_content)
 
  @staticmethod
- def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: str = './dwca/output/'):
+ def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: str):
  """Load a DwCA and remove extension files from it
 
  :param dwca_file: The path to the DwCA
@@ -50,7 +51,7 @@ def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: st
 
  @staticmethod
  def delete_records(dwca_file: str, records_to_delete: CsvFileType,
- output_dwca_path: str = './dwca/output/'):
+ output_dwca_path: str):
  """Delete core records listed in the records_to_delete file from DwCA.
  The specified keys listed in records_to_delete param must exist in the dwca core file
 
@@ -62,7 +63,7 @@ def delete_records(dwca_file: str, records_to_delete: CsvFileType,
  output_dwca_path=output_dwca_path)
 
  @staticmethod
- def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys_lookup: dict = {},
+ def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys_lookup: dict = None,
  extension_sync: bool = False, regen_ids: bool = False, validate_delta_content: bool = True):
  """Merge a DwCA with a delta DwCA of changes.
 
@@ -80,7 +81,7 @@ def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys
  regen_ids=regen_ids, validate_delta=validate_delta_content)
 
  @staticmethod
- def validate_dwca(dwca_file: str, keys_lookup: dict = {}, error_file: str = None):
+ def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
  """Test a dwca for consistency
 
  :param dwca_file: The path to the DwCA

diff --git a/tests/input_files/sample/multimedia.csv b/tests/input_files/sample/multimedia.csv
@@ -0,0 +1,4 @@
+occurrenceID,format,creator,license,type,identifier,documentId,rights
+014826,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=d68f8c06,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
+014825,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=a5923083,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
+014824,jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=7fab23e4,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed."
diff --git a/tests/input_files/sample/occurrence.csv b/tests/input_files/sample/occurrence.csv
@@ -0,0 +1,13 @@
+occurrenceID,basisOfRecord,scientificName,license,decimalLatitude,decimalLongitude
+014826,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-30.0000,144.0000
+014825,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-31.1111,145.0000
+014824,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-32.085431,100.828059
+014823,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-33.097233,101.820888
+014822,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-34.099936,102.821654
+014821,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-35.893671,104.999974
+014802,Human Observation,Alectryon coriaceus,CC-BY 4.0 (Int),-34.113747,120.889354
+014801,Human Observation,Eucalyptus robusta,CC-BY 4.0 (Int),-36.0000,144.308848
+014800,Human Observation,Arundo donax,CC-BY 4.0 (Int),-30.440251,146.240159
+014799,Human Observation,Arundo donax,CC-BY 4.0 (Int),-31.547195,150.783246
+014798,Human Observation,Arundo donax,CC-BY 4.0 (Int),-40.481117,150.823468
+014792,Human Observation,Euphorbia paralias,CC-BY 4.0 (Int),-28.0000,115.0000
diff --git a/tests/test_writedwca.py b/tests/test_writedwca.py
@@ -0,0 +1,129 @@
+from dwcahandler import DwcaHandler, CsvFileType, CoreOrExtType, Eml
+from zipfile import ZipFile
+from pathlib import Path
+import xml.etree.ElementTree as ET
+import re
+import pandas as pd
+
+
+def _get_namespace(element):
+ """Get the namespace from a `{namespace}tag` formatted URI
+
+ :param element: The element whose tag carries the namespace
+ :return: The namespace for the element
+ """
+ m = re.match("\\{.*\\}", element.tag)
+ return m.group(0) if m else ''
+
+
+def _get_eml_content():
+ return Eml(dataset_name='Sample Dataset',
+ description='A dataset sample',
+ license='sample license',
+ citation='sample citation',
+ rights='sample rights')
+
+
+occurrence_sample_file = "./input_files/sample/occurrence.csv"
+multimedia_sample_file = "./input_files/sample/multimedia.csv"
+sample_occ_df = pd.read_csv(occurrence_sample_file)
+sample_multimedia_df = pd.read_csv(multimedia_sample_file)
+
+
+class TestPublish:
+ """
+ Tests for DwCA creation
+ """
+
+ def test_generate_dwca_without_ext(self):
+ """
+ Test that generated dwca is valid with core occ data
+ """
+ core_csv = CsvFileType(files=["./input_files/sample/occurrence.csv"], keys=['occurrenceID'],
+ type=CoreOrExtType.CORE)
+ p = Path("temp")
+ p.mkdir(parents=True, exist_ok=True)
+ dwca_output_path = str(Path(p / "dwca.zip").absolute())
+ DwcaHandler.create_dwca(core_csv=core_csv, output_dwca_path=dwca_output_path,
+ eml_content=_get_eml_content())
+ with ZipFile(dwca_output_path, 'r') as zf:
+ files = zf.namelist()
+ assert 'meta.xml' in files
+ assert 'eml.xml' in files
+ core_file = ""
+ with zf.open('meta.xml') as meta_xml_file:
+ tree = ET.parse(meta_xml_file)
+ root = tree.getroot()
+ ns = _get_namespace(root)
+ assert ns == "{http://rs.tdwg.org/dwc/text/}"
+ core_node = root.find(f'{ns}{CoreOrExtType.CORE}')
+ assert core_node
+ fields = core_node.findall(f'{ns}field')
+ term_fields = [f.attrib.get('term') for f in fields]
+ assert len(term_fields) == len(sample_occ_df.columns)
+ for sample_col in sample_occ_df.columns:
+ assert any(sample_col in f for f in term_fields)
+ core_file = core_node.find(f'{ns}files').find(f'{ns}location').text
+
+ assert core_file
+ with zf.open(core_file) as occ_file:
+ df = pd.read_csv(occ_file)
+ pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df)
+
+ zf.close()
+
+ def test_generate_dwca_with_ext(self):
+ """
+ Test that generated dwca is valid with core occ and multimedia data
+ """
+ core_csv = CsvFileType(files=["./input_files/sample/occurrence.csv"], keys=['occurrenceID'],
+ type=CoreOrExtType.CORE)
+ ext_csv = CsvFileType(files=["./input_files/sample/multimedia.csv"], keys=['occurrenceID'],
+ type=CoreOrExtType.EXTENSION)
+ p = Path("temp")
+ p.mkdir(parents=True, exist_ok=True)
+ dwca_output_path = str(Path(p / "dwca_with_ext.zip").absolute())
+ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext_csv], output_dwca_path=dwca_output_path,
+ eml_content=_get_eml_content())
+ with ZipFile(dwca_output_path, 'r') as zf:
+ files = zf.namelist()
+ assert 'meta.xml' in files
+ assert 'eml.xml' in files
+ core_file = ""
+ with zf.open('meta.xml') as meta_xml_file:
+ tree = ET.parse(meta_xml_file)
+ root = tree.getroot()
+ ns = _get_namespace(root)
+ assert ns == "{http://rs.tdwg.org/dwc/text/}"
+ core_node = root.find(f'{ns}{CoreOrExtType.CORE}')
+ assert core_node
+ fields = core_node.findall(f'{ns}field')
+ term_fields = [f.attrib.get('term') for f in fields]
+ assert len(term_fields) == len(sample_occ_df.columns)
+ for sample_col in sample_occ_df.columns:
+ assert any(sample_col in f for f in term_fields)
+ core_file = core_node.find(f'{ns}files').find(f'{ns}location').text
+
+ ext_node = root.find(f'{ns}{CoreOrExtType.EXTENSION}')
+ assert ext_node
+ fields = ext_node.findall(f'{ns}field')
+ term_fields = [f.attrib.get('term') for f in fields]
+ assert len(term_fields) == len(sample_multimedia_df.columns)
+ for sample_m_col in sample_multimedia_df.columns:
+ assert any(sample_m_col in f for f in term_fields)
+ ext_file = ext_node.find(f'{ns}files').find(f'{ns}location').text
+
+ assert core_file
+ assert ext_file
+
+ with zf.open(core_file) as occ_file:
+ df = pd.read_csv(occ_file)
+ assert 'id' in df.columns
+ pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df)
+
+ with zf.open(ext_file) as image_file:
+ df = pd.read_csv(image_file)
+ assert 'coreid' in df.columns
+ pd.testing.assert_frame_equal(df.drop(columns=['coreid']), sample_multimedia_df)
+
+ zf.close()