Skip to content

Commit

Permalink
Shrink dump data model after parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mmarx committed Dec 10, 2023
1 parent 85488c7 commit a4bed87
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 2 deletions.
3 changes: 3 additions & 0 deletions helpers/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ authors = ["Maximilian Marx <[email protected]>"]
edition = "2021"
license = "Apache-2.0"

[features]
full-json-model = []

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
Expand Down
158 changes: 157 additions & 1 deletion helpers/rust/src/types/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,20 @@ impl SiteRecord {
items: 0,
}
}

/// Calls `shrink_to_fit` on each optional field of this record that is present.
///
/// The fields visited are `url_pattern`, `group` and `language`; a field that is
/// `None` is skipped.
pub fn shrink_to_fit(&mut self) {
    if let Some(pattern) = self.url_pattern.as_mut() {
        pattern.shrink_to_fit();
    }
    if let Some(site_group) = self.group.as_mut() {
        site_group.shrink_to_fit();
    }
    if let Some(lang) = self.language.as_mut() {
        lang.shrink_to_fit();
    }
}
}

#[derive(Debug, Default, PartialEq, Eq, Deserialize, Serialize)]
Expand Down Expand Up @@ -467,6 +481,23 @@ pub(crate) mod dump {
},
}

impl Record {
    /// Shrinks this record in place by calling `shrink_to_fit` on its parts.
    ///
    /// For an `Item` this covers the sitelink map, every sitelink stored in it, and
    /// the common data; for a `Property` only the common data is visited.
    pub fn shrink_to_fit(&mut self) {
        match self {
            Self::Property { common, .. } => common.shrink_to_fit(),
            Self::Item {
                sitelinks, common, ..
            } => {
                // Shrink the map itself first, then each value it holds.
                sitelinks.shrink_to_fit();
                for link in sitelinks.values_mut() {
                    link.shrink_to_fit();
                }
                common.shrink_to_fit();
            }
        }
    }
}

#[derive(Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct CommonData {
Expand All @@ -488,6 +519,27 @@ pub(crate) mod dump {
}

impl CommonData {
/// Calls `shrink_to_fit` on the label, description, alias and claim maps and on
/// everything stored in them.
///
/// Each map is shrunk first and then every value in it. For aliases and claims the
/// values are themselves collections, so their elements are shrunk as well.
pub fn shrink_to_fit(&mut self) {
    self.labels.shrink_to_fit();
    for label in self.labels.values_mut() {
        label.shrink_to_fit();
    }

    self.descriptions.shrink_to_fit();
    for description in self.descriptions.values_mut() {
        description.shrink_to_fit();
    }

    self.aliases.shrink_to_fit();
    for alias_list in self.aliases.values_mut() {
        alias_list.shrink_to_fit();
        for alias in alias_list.iter_mut() {
            alias.shrink_to_fit();
        }
    }

    self.claims.shrink_to_fit();
    for statements in self.claims.values_mut() {
        statements.shrink_to_fit();
        for statement in statements.iter_mut() {
            statement.shrink_to_fit();
        }
    }
}

/// Looks up the label stored under `language` and returns a clone of its value.
///
/// Returns `None` when `labels` has no entry for `language`.
pub(crate) fn label_for(&self, language: &str) -> Option<String> {
    let label = self.labels.get(language)?;
    Some(label.value.clone())
}
Expand All @@ -501,12 +553,14 @@ pub(crate) mod dump {
#[serde(rename_all = "camelCase", tag = "type")]
pub enum Statement {
Statement {
#[cfg(feature = "full-json-model")]
id: String,
mainsnak: Snak,
#[serde(default)]
rank: Rank,
#[serde(default)]
qualifiers: HashMap<Property, Vec<Snak>>,
#[cfg(feature = "full-json-model")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
qualifiers_order: Vec<Property>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
Expand All @@ -515,6 +569,38 @@ pub(crate) mod dump {
}

impl Statement {
/// Calls `shrink_to_fit` on the main snak, the qualifier map (including every
/// qualifier list and qualifier in it) and the reference list with its elements.
///
/// With the `full-json-model` feature enabled, `id` and `qualifiers_order` are
/// shrunk too; without it those fields are not part of the pattern at all.
pub fn shrink_to_fit(&mut self) {
    match self {
        Self::Statement {
            #[cfg(feature = "full-json-model")]
            id,
            mainsnak,
            qualifiers,
            references,
            #[cfg(feature = "full-json-model")]
            qualifiers_order,
            ..
        } => {
            // Fields that only exist in the full model are handled together.
            #[cfg(feature = "full-json-model")]
            {
                id.shrink_to_fit();
                qualifiers_order.shrink_to_fit();
            }

            mainsnak.shrink_to_fit();

            qualifiers.shrink_to_fit();
            for snaks in qualifiers.values_mut() {
                snaks.shrink_to_fit();
                for snak in snaks.iter_mut() {
                    snak.shrink_to_fit();
                }
            }

            references.shrink_to_fit();
            for reference in references.iter_mut() {
                reference.shrink_to_fit();
            }
        }
    }
}

pub fn mainsnak(&self) -> &Snak {
match self {
Statement::Statement { mainsnak, .. } => mainsnak,
Expand Down Expand Up @@ -548,13 +634,28 @@ pub(crate) mod dump {
badges: Vec<Item>,
}

impl Sitelink {
    /// Calls `shrink_to_fit` on the site, the title and the badge list.
    pub fn shrink_to_fit(&mut self) {
        let Self {
            site,
            title,
            badges,
            ..
        } = self;

        site.shrink_to_fit();
        title.shrink_to_fit();
        badges.shrink_to_fit();
    }
}

/// A string `value` paired with a `language` string.
///
/// Serialized and deserialized through serde with camelCase field names.
#[derive(Debug, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct LanguageValue {
/// Language tag of the value (presumably a language code; confirm against the dump format).
language: String,
/// The text itself.
value: String,
}

impl LanguageValue {
    /// Calls `shrink_to_fit` on both the language and the value string.
    pub fn shrink_to_fit(&mut self) {
        let Self { language, value } = self;
        language.shrink_to_fit();
        value.shrink_to_fit();
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum Rank {
Expand Down Expand Up @@ -587,6 +688,13 @@ pub(crate) mod dump {
}

impl Snak {
/// Calls `shrink_to_fit` on the data value of a `Value` snak.
///
/// `SomeValue` and `NoValue` snaks are left as they are.
pub fn shrink_to_fit(&mut self) {
    match self {
        Self::SomeValue { .. } | Self::NoValue { .. } => {}
        Self::Value { datavalue, .. } => datavalue.shrink_to_fit(),
    }
}

pub fn as_data_value(&self) -> Option<&DataValue> {
match self {
Snak::Value {
Expand All @@ -602,13 +710,30 @@ pub(crate) mod dump {
/// A reference: snaks grouped by the property they belong to.
///
/// With the `full-json-model` feature enabled, the reference also keeps its `hash`
/// and the `snaks_order` list; without the feature those fields do not exist.
/// Field names are serialized in kebab-case.
#[derive(Default, Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct Reference {
    #[cfg(feature = "full-json-model")]
    hash: String,
    #[cfg(feature = "full-json-model")]
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    snaks_order: Vec<Property>,
    // `default` pairs with `skip_serializing_if`: an empty map is omitted when
    // serializing, so a missing key must deserialize back to an empty map instead
    // of failing with a "missing field" error.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub(crate) snaks: HashMap<Property, Vec<Snak>>,
}

impl Reference {
    /// Calls `shrink_to_fit` on the snak map, on every snak list in it and on every
    /// snak in those lists.
    ///
    /// With the `full-json-model` feature enabled, `hash` and `snaks_order` are
    /// shrunk first.
    pub fn shrink_to_fit(&mut self) {
        #[cfg(feature = "full-json-model")]
        {
            self.hash.shrink_to_fit();
            self.snaks_order.shrink_to_fit();
        }

        self.snaks.shrink_to_fit();
        for group in self.snaks.values_mut() {
            group.shrink_to_fit();
            for snak in group.iter_mut() {
                snak.shrink_to_fit();
            }
        }
    }
}

#[derive(Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case", tag = "type")]
pub enum DataValue {
Expand All @@ -635,6 +760,17 @@ pub(crate) mod dump {
}

impl DataValue {
pub fn shrink_to_fit(&mut self) {
match self {
DataValue::String { value } => value.shrink_to_fit(),
DataValue::Quantity { value } => value.shrink_to_fit(),
DataValue::MonolingualText { value } => value.shrink_to_fit(),
DataValue::WikibaseEntityid { .. }
| DataValue::GlobeCoordinate { .. }
| DataValue::Time { .. } => (),
}
}

pub fn as_entity_id(&self) -> Option<&EntityId> {
match self {
DataValue::WikibaseEntityid { value } => Some(value),
Expand All @@ -648,8 +784,9 @@ pub(crate) mod dump {
/// An entity reference made of an entity type and an entity id.
///
/// With the `full-json-model` feature enabled the optional `numeric_id` is kept as
/// well; it is omitted from serialized output when it is `None`.
pub struct EntityId {
    pub(crate) entity_type: EntityType,
    pub(crate) id: Entity,
    // Declared exactly once: a second `numeric_id` field is a duplicate-field
    // compile error with the feature on, and an ungated field with it off.
    #[cfg(feature = "full-json-model")]
    #[serde(skip_serializing_if = "Option::is_none")]
    numeric_id: Option<u32>,
}

#[derive(Debug, PartialEq, Eq, Deserialize, Serialize)]
Expand Down Expand Up @@ -692,6 +829,18 @@ pub(crate) mod dump {
lowerbound: Option<String>,
}

impl Quantity {
    /// Calls `shrink_to_fit` on the upper and lower bound when they are present.
    ///
    /// NOTE(review): only the two bounds are visited here; the remaining fields of
    /// `Quantity` are not visible in this view. Confirm none of them needs shrinking.
    pub fn shrink_to_fit(&mut self) {
        if let Some(upper) = &mut self.upperbound {
            upper.shrink_to_fit();
        }
        if let Some(lower) = &mut self.lowerbound {
            lower.shrink_to_fit();
        }
    }
}

#[derive(Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct Time {
#[serde(
Expand All @@ -715,6 +864,13 @@ pub(crate) mod dump {
text: String,
}

impl MonolingualText {
    /// Calls `shrink_to_fit` on both the language and the text string.
    pub fn shrink_to_fit(&mut self) {
        let Self { language, text, .. } = self;
        language.shrink_to_fit();
        text.shrink_to_fit();
    }
}

#[derive(Debug, PartialEq, Eq, Deserialize_repr, Serialize_repr)]
#[repr(u8)]
pub enum TimePrecision {
Expand Down
9 changes: 8 additions & 1 deletion helpers/rust/src/types/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ impl DumpStatistics {
log::info!("Added {} indirect subclass relationships", added);

result.statistics.sites = sites.collect();
result.statistics.sites.shrink_to_fit();
result
.statistics
.sites
.values_mut()
.for_each(|sitelink| sitelink.shrink_to_fit());
log::info!("Got {} sitelink records", result.statistics.sites.len());

result
Expand Down Expand Up @@ -142,8 +148,9 @@ impl DumpStatistics {

log::trace!("parsing record: {raw_record:?}");

let record: Record =
let mut record: Record =
serde_json::from_str(raw_record).context("Failed parsing the record")?;
record.shrink_to_fit();

match record {
Record::Item {
Expand Down

0 comments on commit a4bed87

Please sign in to comment.