Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Girotomas #18

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat: adding the scaffolding to match the author links
  • Loading branch information
girotomas committed Mar 29, 2024
commit 36f8cc4076722f8859ac1f07b54b3e0fa9cd845e
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ There is maintained work item list [here](task_backlog.md) that you can pick up

Love what we're doing? Star the repository and spread the word! Your support helps in keeping this initiative active and growing.

# How to setup the environment ?
## How to set up the environment?

```
./setup.sh
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pandas as pd
import os
import re
from re import Pattern
from typing import Dict, List
from abc import ABC, abstractmethod


from src.populate_csv_files.parse_new_data import url_patterns
from src.utils import root_directory
Expand Down Expand Up @@ -43,10 +46,58 @@ def match_website_base(article_url, url_patterns):
return f"Error: {e}"
return ", ".join(base_urls) # Join unique base URLs with comma separator


class Url:
    """Lightweight value object wrapping a raw URL string.

    No validation or normalisation is performed; `value` is stored as given.
    """

    def __init__(self, value: str):
        # Raw URL string as supplied by the caller.
        self.value = value

    def __repr__(self) -> str:
        # Makes error messages that interpolate a Url (e.g. the
        # multiple-matchers Exception in AuthorUrlExtractor) readable.
        return f"Url({self.value!r})"

    def __eq__(self, other) -> bool:
        # Value semantics: two Urls are equal when they wrap the same string.
        return isinstance(other, Url) and self.value == other.value

    def __hash__(self) -> int:
        # Paired with __eq__ so Url instances can be used in sets/dict keys.
        return hash(self.value)

class ArticleMatcher(ABC):
    """Interface for strategies that recognise article URLs and derive author links.

    Implementations decide whether they can handle a URL (`matches`) and, if so,
    how to map it to the author's page (`get_author_link`).
    """

    @abstractmethod
    def matches(self, url: Url) -> bool:
        """Return True when this matcher can handle *url*."""

    @abstractmethod
    def get_author_link(self, url: Url) -> Url:
        """Return the author-page URL corresponding to *url*."""

class MatchNotFoundException(Exception):
    """Raised when no registered matcher recognises a given URL."""

class AuthorUrlExtractor:
    """Resolves an article URL to its author's URL via registered matchers.

    Exactly one matcher must claim a URL; zero matches raises
    MatchNotFoundException and more than one raises Exception.
    """

    article_matchers: List[ArticleMatcher]

    def __init__(self, website_matchers):
        # Matchers are consulted in the order given.
        self.article_matchers: List[ArticleMatcher] = website_matchers

    def extract_url(self, url: Url) -> Url:
        """Return the author URL for *url*.

        Raises:
            MatchNotFoundException: if no matcher claims the URL.
            Exception: if more than one matcher claims the URL (ambiguous).
        """
        # Evaluate each matcher exactly once (the original scanned the list
        # twice, calling matches() per matcher both times).
        matched = [m for m in self.article_matchers if m.matches(url)]
        match_count = len(matched)
        if match_count > 1:
            raise Exception(f"More than one matcher: ({match_count}) matched for url: {url}")
        if match_count == 0:
            raise MatchNotFoundException()
        return matched[0].get_author_link(url)


class UrlPatternsMatcher(ArticleMatcher):
    """Matches article URLs against a mapping of site name -> regex pattern.

    NOTE(review): `get_author_link` is still abstract here — this class is
    scaffolding and cannot be instantiated until a subclass (or a later
    revision) implements it.
    """

    def __init__(self, url_patterns: Dict[str, str]):
        # Original patterns, keyed by site name.
        self.url_patterns = url_patterns
        # Compiled patterns, keyed by site name. The original annotation
        # `Dict[Pattern[str]]` is malformed (typing.Dict takes two
        # parameters) and the dict was never populated.
        self.matchers: Dict[str, Pattern[str]] = {
            name: re.compile(pattern) for name, pattern in url_patterns.items()
        }

    def matches(self, url: Url) -> bool:
        """Return True when any configured pattern matches *url*.

        The original body was a bare `return`, yielding None despite the
        declared `-> bool`.
        """
        return any(pattern.search(url.value) for pattern in self.matchers.values())


def main():
input_filepath = f"{root_directory()}/data/links/articles_updated.csv"
articles_df = pd.read_csv(input_filepath)

articles = pd.read_csv(input_filepath)
for url in articles.article:
print(url)
return
# Iterate through each article URL, find its website match, and get the equivalent URL
for _, row in articles_df.iterrows():
try:
Expand Down