FEAT: add CsvProcessor to python bindings

NickCrews · Jul 29, 2023 · a27a74e · a27a74e
1 parent e3e7a42
commit a27a74e
Show file tree

Hide file tree

Showing 9 changed files with 114 additions and 42 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,9 +2,9 @@
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.4.0"
+# KEEP IN SYNC WITH python/pyproject.toml
+version = "0.4.1"
 edition = "2021"
 rust-version = "1.69"
 homepage = "https://github.com/NickCrews/feco3"
-documentation = "https://github.com/NickCrews/feco3"
 repository = "https://github.com/NickCrews/feco3"
diff --git a/README.md b/README.md
@@ -1,10 +1,21 @@
 # FECo3
 
-A .FEC file parser in rust, with python bindings
+A .FEC file parser in Rust, with Python bindings. The Rust code is intended to
+be extensible, easy to maintain, and performant. The Python API is intended to
+be easy to use, type-hinted, and extensible, and to
+integrate with the rest of the Python data ecosystem.
 
 Still in alpha.
 
-## Example
+## Links
+
+- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API.
+- [Rust docs](https://docs.rs/feco3), if you want to use the Rust API.
+- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
+ if you want to know more about the .fec file format or are interested in writing
+ your own parser or improving this one.
+
+## Example Python
 
 ```python
 import pyarrow as pa
@@ -13,10 +24,17 @@ import feco3
 # ruff: noqa: E501
 
 # You can supply a URL or a path to a file.
+# Possibly in the future we'll support reading from a file-like object.
 src = "https://docquery.fec.gov/dcdev/posted/1002596.fec"
 # src = "path/to/file.fec"
 # src = pathlib.Path("path/to/file.fec")
 
+# The straightforward way is to just parse to a directory of files,
+# one file for each itemization type, eg "csvs/SA11AI.csv", etc
+feco3.FecFile(src).to_csvs("csvs/")
+feco3.FecFile(src).to_parquets("parquets/")
+
+# Or, you can look at the file at a lower level.
 # This doesn't actually read or parse any data yet
 fec = feco3.FecFile(src)
 print(fec)
@@ -63,13 +81,7 @@ for batch in batcher:
 
 ```
 
-## Documentation
 
-- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API
-- [Rust docs](https://docs.rs/feco3), if you want to write to the Rust API
-- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
- if you want to know more about the .fec file format or are interested in writing
- your own parser or improving this one.
 
 ## Related projects
 

diff --git a/crates/feco3_python/src/lib.rs b/crates/feco3_python/src/lib.rs
@@ -111,6 +111,24 @@ impl ParquetProcessor {
  }
 }
 
+/// Python-facing wrapper around [`feco3::writers::csv::CSVProcessor`].
+///
+/// Registered on the `_feco3` module and used by the Python-side
+/// `FecFile.to_csvs` helper.
+#[pyclass]
+struct CsvProcessor(feco3::writers::csv::CSVProcessor);
+
+#[pymethods]
+impl CsvProcessor {
+    /// Create a processor for `out_dir`, which is forwarded unchanged to the
+    /// wrapped `CSVProcessor::new`.
+    #[new]
+    fn new(out_dir: PathBuf) -> Self {
+        Self(feco3::writers::csv::CSVProcessor::new(out_dir))
+    }
+
+    /// Run the wrapped processor over `fec_file`.
+    ///
+    /// # Errors
+    ///
+    /// Any error returned by the underlying `process` call is converted with
+    /// `to_py_err` and returned as the `PyResult` error.
+    fn process(&mut self, fec_file: &mut FecFile) -> PyResult<()> {
+        // `map_err` only rewrites the error arm; the `Ok(())` value passes through as-is.
+        self.0.process(&mut fec_file.0).map_err(to_py_err)
+    }
+}
+
 #[pyclass]
 struct PyarrowBatcher(feco3::writers::arrow::RecordBatchProcessor);
 
@@ -144,6 +162,7 @@ fn _feco3(_py: Python, m: &PyModule) -> PyResult<()> {
  pyo3_log::init();
  m.add_class::<FecFile>()?;
  m.add_class::<ParquetProcessor>()?;
+ m.add_class::<CsvProcessor>()?;
  m.add_class::<PyarrowBatcher>()?;
  Ok(())
 }

diff --git a/python/example.py b/python/example.py
@@ -1,5 +1,5 @@
-import pyarrow as pa
 import feco3
+import pyarrow as pa
 
 # ruff: noqa: E501
 
@@ -8,6 +8,12 @@
 # src = "path/to/file.fec"
 # src = pathlib.Path("path/to/file.fec")
 
+# The straightforward way is to just parse to a directory of files,
+# one file for each itemization type, eg "csvs/SA11AI.csv", etc
+feco3.FecFile(src).to_csvs("csvs/")
+feco3.FecFile(src).to_parquets("parquets/")
+
+# Or, you can look at the file at a lower level.
 # This doesn't actually read or parse any data yet
 fec = feco3.FecFile(src)
 print(fec)

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -5,7 +5,8 @@ build-backend = "maturin"
 [project]
 name = "feco3"
 # TODO make this dynamic
-version = "0.4.0"
+# KEEP IN SYNC WITH ../Cargo.toml
+version = "0.4.1"
 description = "A Rust-backed Python library for parsing .fec files."
 requires-python = ">=3.7"
 readme = "README.md"

diff --git a/python/src/feco3/__init__.py b/python/src/feco3/__init__.py
@@ -1,16 +1,13 @@
 """FECo3: Python bindings to a .fec file parser written in Rust."""
 
 from __future__ import annotations
-from functools import cached_property
-from typing import NamedTuple
 
 import os
+from functools import cached_property
 from pathlib import Path
-from typing import TYPE_CHECKING
-
+from typing import TYPE_CHECKING, NamedTuple
 
-from . import _version
-from . import _feco3
+from . import _feco3, _version
 
 if TYPE_CHECKING:
  import pyarrow as pa
@@ -104,14 +101,22 @@ def cover(self) -> Cover:
  filer_committee_id=c.filer_committee_id,
  )
 
- def to_parquet(self, out_dir: str | os.PathLike) -> None:
+ def to_parquets(self, out_dir: str | os.PathLike) -> None:
  """Write all itemizations in this FEC file to parquet files.
 
  There will be one parquet file for each record type, eg. ``sa11.parquet``.

  Args:
      out_dir: Output directory; passed as-is to ``_feco3.ParquetProcessor``.
  """
  parser = _feco3.ParquetProcessor(out_dir)
  parser.process(self._wrapped)
 
+ def to_csvs(self, out_dir: str | os.PathLike) -> None:
+ """Write all itemizations in this FEC file to CSV files.
+
+ There will be one CSV file for each record type, eg. ``sa11.csv``.
+ """
+ parser = _feco3.CsvProcessor(out_dir)
+ parser.process(self._wrapped)
+
  def __repr__(self) -> str:
  src_str = f"src={self._src!r}"
  return f"{self.__class__.__name__}({src_str})"
@@ -139,7 +144,14 @@ class PyarrowBatcher:
  Iterates an [FecFile][feco3.FecFile] and yields [ItemizationBatch][feco3.ItemizationBatch]s of itemizations.
  """ # noqa: E501
 
- def __init__(self, fec_file: FecFile, max_batch_size: int | None = None):
+ def __init__(self, fec_file: FecFile, max_batch_size: int | None = None) -> None:
+ """Create a new PyarrowBatcher.
+
+ Args:
+ fec_file: The [FecFile][feco3.FecFile] to iterate.
+ max_batch_size: The max rows per [pyarrow.RecordBatch][pyarrow.RecordBatch].
+ Defaults to 1024 * 1024, which is what rust parquet uses.
+ """
  self._fec_file = fec_file
  if max_batch_size is None:
  max_batch_size = DEFAULT_PYARROW_RECORD_BATCH_MAX_SIZE

diff --git a/python/test/test_pyarrow.py b/python/test/test_pyarrow.py
diff --git a/python/test/test_writers.py b/python/test/test_writers.py
@@ -0,0 +1,42 @@
+from pathlib import Path
+
+import feco3
+import pyarrow as pa
+
+from . import common
+
+
+def test_pyarrow_batches():
+ path = common.get_case_path("slash_form.fec")
+ fec = feco3.FecFile(path)
+ batcher = feco3.PyarrowBatcher(fec)
+ # Can convert to list
+ batches = list(batcher)
+ assert len(batches) > 1
+ seen_codes = set()
+ for b in batches:
+ assert isinstance(b, feco3.ItemizationBatch)
+ assert isinstance(b.code, str)
+ assert isinstance(b.records, pa.RecordBatch)
+ assert b.records.num_rows > 0
+ assert b.records.num_columns > 0
+ seen_codes.add(b.code)
+
+ assert seen_codes == {"SA11AI", "SD10", "SC2/10", "SC/10", "SB17"}
+
+ # We have used up the fec file, so iterating again finds no itemizations
+ assert list(feco3.PyarrowBatcher(fec)) == []
+
+
+def test_csvs(tmp_path: Path):
+ path = common.get_case_path("slash_form.fec")
+ fec = feco3.FecFile(path)
+ fec.to_csvs(tmp_path)
+ assert len(list(tmp_path.glob("*.csv"))) == 5
+
+
+def test_parquets(tmp_path: Path):
+ path = common.get_case_path("slash_form.fec")
+ fec = feco3.FecFile(path)
+ fec.to_parquets(tmp_path)
+ assert len(list(tmp_path.glob("*.parquet"))) == 5