From 3adac6fe91b66e10e12a7c4e4b26b0def07f64c2 Mon Sep 17 00:00:00 2001 From: drew2a Date: Thu, 23 May 2024 14:51:43 +0200 Subject: [PATCH] Update Knowledge Component documentation Enhanced the Knowledge Component's README with additional issue references, code snippets, and explanations. Improved readability by reformatting some sections and adding more context to certain parts of the document. Introduced a new section on Content Bundling, explaining its purpose, approach, and implementation details in Tribler. --- .../core/components/knowledge/README.md | 67 +++++++++++++++---- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/src/tribler/core/components/knowledge/README.md b/src/tribler/core/components/knowledge/README.md index 68adbb8c85..7116fdc6dc 100644 --- a/src/tribler/core/components/knowledge/README.md +++ b/src/tribler/core/components/knowledge/README.md @@ -1,5 +1,8 @@ Below you can find a brief description of the Knowledge Component. For more -information please check the source code and https://github.com/Tribler/tribler/issues/6214 +information please check the source code and the following issues: + +- https://github.com/Tribler/tribler/issues/6214 +- https://github.com/Tribler/tribler/issues/7837 ## Database @@ -11,9 +14,10 @@ anonymize the peer, not their main public key is used, but a secondary key as de ```python # secondary key: - secondary_private_key_path = config.state_dir / config.trustchain.secondary_key_filename - self.secondary_key = self.load_or_create(secondary_private_key_path) +secondary_private_key_path = config.state_dir / config.trustchain.secondary_key_filename +self.secondary_key = self.load_or_create(secondary_private_key_path) ``` + https://github.com/Tribler/tribler/blame/26b0be8c4010a6546f9204bb2f0026597dea53a7/src/tribler/core/components/key/key_component.py#L24-L26 In the `KnowledgeGraph`, a `Statement` is an edge with the following (simplified) structure: @@ -26,6 +30,7 @@ class SimpleStatement: predicate: 
ResourceType object: str ``` + https://github.com/Tribler/tribler/blob/76de5620ed7f1e991e9d1adc4cb905ae8a5db8b3/src/tribler/core/components/database/db/layers/knowledge_data_access_layer.py#L60-L65 Where `ResourceType` is @@ -58,6 +63,7 @@ class ResourceType(IntEnum): TORRENT = 102 CONTENT_ITEM = 103 ``` + https://github.com/Tribler/tribler/blob/26b0be8c4010a6546f9204bb2f0026597dea53a7/src/tribler/core/components/database/db/layers/knowledge_data_access_layer.py#L32-L57 Statement examples: @@ -85,6 +91,7 @@ class Operation(IntEnum): ADD = 1 # +1 operation REMOVE = 2 # -1 operation ``` + https://github.com/Tribler/tribler/blob/26b0be8c4010a6546f9204bb2f0026597dea53a7/src/tribler/core/components/database/db/layers/knowledge_data_access_layer.py#L26-L29 All operations are recorded in the database, allowing for the calculation of the final score of a specific operation @@ -154,7 +161,8 @@ https://github.com/Tribler/tribler/blob/main/src/tribler/core/components/knowled The algorithm of the community's operation: 1. Every 5 seconds, we request 10 `StatementOperations` from a random peer. -https://github.com/Tribler/tribler/blob/44e2235e0b3bcdc12ae2fcd874bc058474973e5b/src/tribler/core/components/knowledge/community/knowledge_payload.py#L8-L18 + https://github.com/Tribler/tribler/blob/44e2235e0b3bcdc12ae2fcd874bc058474973e5b/src/tribler/core/components/knowledge/community/knowledge_payload.py#L8-L18 + ```python @dataclass class StatementOperation: @@ -168,28 +176,37 @@ class StatementOperation: clock: int # This is the lamport-like clock that unique for each quadruple {public_key, subject, predicate, object} creator_public_key: type_from_format('74s') ``` + 2. Upon receiving a response, we verify the signatures of the operations and their validity. 
+ ```python def verify_signature(self, packed_message: bytes, key: Key, signature: bytes, operation: StatementOperation): - if not self.crypto.is_valid_signature(key, packed_message, signature): - raise InvalidSignature(f'Invalid signature for {operation}') + + +if not self.crypto.is_valid_signature(key, packed_message, signature): + raise InvalidSignature(f'Invalid signature for {operation}') ``` + https://github.com/Tribler/tribler/blob/44e2235e0b3bcdc12ae2fcd874bc058474973e5b/src/tribler/core/components/knowledge/community/knowledge_community.py#L126-L128 ```python def validate_operation(operation: StatementOperation): - validate_resource(operation.subject) - validate_resource(operation.object) - validate_operation(operation.operation) - validate_resource_type(operation.subject_type) - validate_resource_type(operation.predicate) + + +validate_resource(operation.subject) +validate_resource(operation.object) +validate_operation(operation.operation) +validate_resource_type(operation.subject_type) +validate_resource_type(operation.predicate) ``` + https://github.com/Tribler/tribler/blob/44e2235e0b3bcdc12ae2fcd874bc058474973e5b/src/tribler/core/components/knowledge/community/knowledge_community.py#L119-L124 3. When `StatementOperations` are requested from us, we select N random operations (the number of operations is specified in the request) and return them. Design decisions behind the Community: + - Knowledge operations gossiped transitively across the network - Pull-based gossip is used - For knowledge' integrity check ipv8 signatures are used @@ -197,8 +214,32 @@ Design decisions behind the Community: ## UI Two changes have been made to the UI: + 1. Elements for displaying tags (this feature is hided at the current moment). -![image](https://github.com/Tribler/tribler/assets/13798583/2c3e6b98-71fc-42be-ab65-25bcd9012215) + ![image](https://github.com/Tribler/tribler/assets/13798583/2c3e6b98-71fc-42be-ab65-25bcd9012215) 2. 
A dialog for editing metadata: -![image](https://github.com/Tribler/tribler/assets/13798583/14c309ef-8f30-4710-a02e-0338fda60508) + ![image](https://github.com/Tribler/tribler/assets/13798583/14c309ef-8f30-4710-a02e-0338fda60508) + +## Content Bundling + +"Content Bundle" is a strategic feature in Tribler aimed at enhancing the organization +and accessibility of digital content. It acts as an aggregation point for Content Items, +bundling them together under a single, cohesive unit. This structure allows users to +efficiently manage and access groups of related Content Items, simplifying navigation +and retrieval. Ideal for categorizing content that shares common themes, attributes, +or sources, the Content Bundle provides a streamlined way to handle complex sets of +information, making it easier for users to find and interact with a rich array of +content within the Tribler network. +https://github.com/Tribler/tribler/blob/main/src/tribler/core/components/knowledge/rules/content_bundling.py + +![image](https://github.com/Tribler/tribler/assets/13798583/762a62ec-12a3-4eb3-973c-be56395d2f9e) + +The general approach for grouping items is to calculate N-Grams with TFIDF for numbers in items and cluster them using +HDBSCAN. For full information, see https://github.com/Tribler/tribler/issues/7837. + +The actual implementation differs slightly from the general approach but is essentially a simplification and +optimization of it. + +Content Bundling is conditional. The condition is based on +the [Corrected Type-Token Ratio (CTTR)](https://core.ac.uk/download/pdf/82620241.pdf) formula. \ No newline at end of file