#!/usr/bin/env python3
"""
python pwb.py updatewin -f:"kucosmetics.py" -s:"fix assignNamesToUnnamedRefs"
This is a fork of the master cosmetic_changes.py that ships with pywikibot. The script can run standalone via [[Bikarhêner:Balyozxane/skrîpt/py/kuCosmeticsRun.py]], which does not edit the page if only whitespace changes are detected.
Added:
fixVrefNames --> renames VisualEditor-generated ref names.
replaceDeprecatedTemplates --> changes redirected templates to their target template using a JSON file ([[Bikarhêner:Balyozxane/skrîpt/py/listeyasablonan.py]]).
fixOthers --> a few standard changes for ku.wiki
replaceDeprecatedParams --> uses WP:AutoWikiBrowser/Rename template parameters to replace deprecated/English parameters
removeDupeCats --> removes duplicate categories
fixAgahidankSpace --> standardizes the spacing of parameters in Agahîdank (infobox) templates
removeSelfCat --> removes the self-referencing category from categories that categorize themselves
fixPunctAfterTempl --> moves punctuation that follows templates such as 'Çavkanî hewce ye' to just before the template
fixMainCat --> adds the main category from Wikidata, or creates one matching the page title
"""
import re
import json
import string
import mytools
import requests
import pywikibot
import mwparserfromhell
from enum import IntEnum
from mytools import ucfirst
from pywikibot import textlib
from typing import Any, Union, Tuple
from urllib.parse import urlparse, urlunparse
from pywikibot.backports import Callable, Dict, List, Match, Pattern
from pywikibot.exceptions import InvalidTitleError
from pywikibot.textlib import FILE_LINK_REGEX
from pywikibot.tools import first_lower, first_upper
from pywikibot.tools.chars import url2string
try:
import stdnum.isbn as stdnum_isbn
except ImportError:
stdnum_isbn = None
# Subpage templates. Must be in lower case,
# whereas subpage itself must be case sensitive
moved_links = {
'ku': (['documentation', 'belgekirin'], '/belge'),
}
VERBOSE = False
TESTING = False
class CANCEL(IntEnum):
"""Cancel level to ignore exceptions.
    Determines whether an error skips the whole page, the current method,
    or a single match. ALL re-raises the exception.
.. versionadded:: 6.3
"""
ALL = 0
PAGE = 1
METHOD = 2
MATCH = 3
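# Example: CosmeticChangesToolkit(page, ignore=CANCEL.METHOD) logs a failing
# clean-up method and continues; ignore=CANCEL.PAGE skips the whole page on an
# error, while ignore=CANCEL.ALL re-raises the exception.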
def _format_isbn_match(match: Match[str], strict: bool = True) -> str:
"""Helper function to validate and format a single matched ISBN."""
if not stdnum_isbn:
raise NotImplementedError(
'ISBN functionality not available. Install stdnum package.')
isbn = match['code']
try:
stdnum_isbn.validate(isbn)
except stdnum_isbn.ValidationError as e:
if strict:
raise
pywikibot.log(f'ISBN "{isbn}" validation error: {e}')
return isbn
return stdnum_isbn.format(isbn)
def _reformat_ISBNs(text: str, strict: bool = True) -> str:
"""Helper function to normalise ISBNs in text.
:raises Exception: Invalid ISBN encountered when strict enabled
"""
return textlib.reformat_ISBNs(
text, lambda match: _format_isbn_match(match, strict=strict))
def do_kozmetik(page, text, ignore=CANCEL.MATCH, show_diff=False):
kozmetik_cebu = ""
cc_toolkit = CosmeticChangesToolkit(page,
show_diff=show_diff,
ignore=ignore)
new_text, summaries = cc_toolkit.change(text)
applied_summaries = ', '.join(summaries.values())
if new_text is not False and new_text != text:
kozmetik_cebu = "; paqijiyên kozmetîk"
if applied_summaries:
kozmetik_cebu += f' ({applied_summaries}.)'
return new_text, kozmetik_cebu
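# A minimal usage sketch (the real entry point is kuCosmeticsRun.py, which also
# skips saving when only whitespace changed):
#   site = pywikibot.Site('ku', 'wikipedia')
#   page = pywikibot.Page(site, 'Gotara mînak')  # hypothetical page title
#   new_text, kozmetik_cebu = do_kozmetik(page, page.text)
#   if new_text and new_text != page.text:
#       page.text = new_text
#       page.save(summary='Bot' + kozmetik_cebu)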
class CosmeticChangesToolkit:
"""Cosmetic changes toolkit.
.. versionchanged:: 7.0
`from_page()` method was removed
"""
def __init__(self, page: 'pywikibot.page.BasePage', *,
show_diff: bool = False,
ignore: IntEnum = CANCEL.ALL) -> None:
"""Initializer.
.. versionchanged:: 5.2
instantiate the CosmeticChangesToolkit from a page object;
only allow keyword arguments except for page parameter;
`namespace` and `pageTitle` parameters are deprecated
.. versionchanged:: 7.0
`namespace` and `pageTitle` parameters were removed
:param page: the Page object containing the text to be modified
:param show_diff: show difference after replacements
:param ignore: ignores if an error occurred and either skips the page
or only that method. It can be set one of the CANCEL constants
"""
global VERBOSE
VERBOSE = show_diff
if page.site.sitename != 'wikipedia:ku':
raise ValueError("This script should only be used on ku:wikipedia")
self.site = page.site
self.current_page = page
self.title = page.title()
self.namespace = page.namespace()
self.show_diff = show_diff
self.template = (self.namespace == 10)
self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
self.ignore = ignore
self.summaries = {}
self.tarix = mytools.get_cur_month_year()
self.is_bekategori = mytools.TagHelpers.is_bekategori(page)
if self.namespace == 0 or TESTING:
self.unhidden_cats = mytools.get_unhidden_categories('ku', self.title)
self.is_disambig = page.isDisambig()
self.gotara_zaravayan = mytools.zaravayen_din(page.categories())
self.sernav_templates = mytools.get_cat_members(self.site, "Şablonên ji bo sererastkirina sernavê rûpelê",
10)
append_sernav = ['Danasîna kurt', 'Bikaranîna dîroka rms']
self.sernav_templates.extend(append_sernav)
self.contains_sewi_cat = mytools.is_category_in_page(page, 'Hemû gotarên sêwî')
self.contains_sitil_cat = mytools.is_category_in_page(page, 'Hemû şitil')
self.is_sewi = mytools.TagHelpers.is_sewi(page)
self.is_sitil = mytools.TagHelpers.is_sitil(page)
self.is_liste = mytools.is_liste(self.site, self.unhidden_cats)
self.common_methods = [
self.removeLtrMark,
self.replaceDeprecatedTemplates,
self.addOrphanTag,
self.removeOrphanTag,
self.addUncatTag,
self.removeUncatTag,
self.fixLead,
self.addStubTag,
self.removeStubTag,
self.fixApostSign,
self.fixNowiki,
self.fixSelfInterwiki,
self.fixMainCat,
self.standardizePageFooter,
self.fixSyntaxSave,
self.cleanUpLinks,
self.cleanUpSectionHeaders,
self.putSpacesInLists,
self.translateAndCapitalizeNamespaces,
self.translateMagicWords,
self.resolveHtmlEntities,
self.removeNonBreakingSpaceBeforePercent,
self.fixHtml,
self.fixReferences,
self.assignNamesToUnnamedRefs,
self.fixVrefNames,
self.fixStyle,
self.fixTypo,
self.fixSectionTitles,
self.replaceDeprecatedParams,
self.removeDupeCats,
self.removeDupeParam,
self.fixAgahidankSpace,
self.removeSelfCat,
self.fixPunctAfterTempl,
self.removeUselessSpaces
]
if stdnum_isbn:
self.common_methods.append(self.fix_ISBN)
# Define the explanation for each method
method_explanations = {
'removeLtrMark': '--U+200E',
'addOrphanTag': '+{{Sêwî}}',
'removeOrphanTag': '--{{Sêwî}}',
'fixLead': 'Destpêkê standard kir',
'addStubTag': '+{{Şitil}}',
'removeStubTag': '--{{Şitil}}',
'addUncatTag': '+{{Bêkategorî}}',
'removeUncatTag': '--{{Bêkategorî}}',
'assignNamesToUnnamedRefs': 'Nav li ref-ê zêde kir',
'fixApostSign': 'Apostrof rast kir',
'fixNowiki': '--<nowiki/>',
'fixSelfInterwiki': '--înterwîkî',
'fix_ISBN': 'ISBN sererast kir',
'fixMainCat': '+Kategoriya sereke',
'standardizePageFooter': 'Binê standard kir',
'fixSyntaxSave': 'Xeletiyên sentaksê rast kir',
'cleanUpLinks': 'Lînk paqij kir',
'cleanUpSectionHeaders': 'Valahiya beşan rast kir',
'putSpacesInLists': '+Valahiya lîsteyan',
'translateAndCapitalizeNamespaces': 'Valahiya nav rast kir',
'translateMagicWords': 'Kelîmeyên sihirî rast kir',
'replaceDeprecatedTemplates': 'Şablonên beralîkirî guhart',
'resolveHtmlEntities': 'HTML rast kir',
'removeUselessSpaces': '--Valahiyên nehewce',
'removeNonBreakingSpaceBeforePercent': '--Valahiya berî sedî',
'fixHtml': 'Xeletiyên HTMLê rast kir',
'fixReferences': 'Ref rast kir',
'fixVrefNames': 'Navên ref-an rast kir',
'fixStyle': 'Stîl rast kir',
'fixTypo': 'Yekeyan rast kir',
'fixSectionTitles': 'Sernavên beşan rast kir',
'replaceDeprecatedParams': 'Parametreyên kevn rast kir',
'removeDupeCats': '--Kategoriya ducarî',
'fixAgahidankSpace': 'Valahiya agahîdankê standard kir',
'removeSelfCat': '--Kategoriya li ser xwe',
'removeDupeParam': '--Parametreya ducarî',
'fixPunctAfterTempl': 'Niqteşanî piştî şablonê rast kir'
}
def safe_execute(self, method: Callable[[str], str], text: str) -> str:
"""Execute the method and catch exceptions if enabled."""
result = None
try:
result = method(text)
except Exception as e:
if self.ignore == CANCEL.METHOD:
pywikibot.warning('Unable to perform "{}" on "{}"!'
.format(method.__name__, self.title))
pywikibot.error(e)
else:
raise
return text if result is None else result
def _check_modification(self, method_name: str, old_text: str, new_text: str) -> None:
"""Check if the text is modified by a method and generate a summary."""
if old_text != new_text:
summary = self.method_explanations.get(method_name, 'sererastkirinên din')
self.summaries[method_name] = summary
def _change(self, text: str) -> str:
"""Execute all clean up methods."""
modified_text = text
for method in self.common_methods:
old_text = modified_text
modified_text = self.safe_execute(method, modified_text)
self._check_modification(method.__name__, old_text, modified_text)
return modified_text
def change(self, text: str) -> Tuple[str, Dict[Any, Any]]:
"""Execute all clean up methods and catch errors if activated."""
try:
new_text = self._change(text)
except Exception as e:
if self.ignore == CANCEL.PAGE:
pywikibot.warning('Skipped "{}", because an error occurred.'
.format(self.title))
pywikibot.error(e)
return "", {} # Return empty string and empty dictionary
raise
else:
# if self.show_diff:
# pywikibot.showDiff(text, new_text)
return new_text, self.summaries
def get_main_cat(self, title: str) -> Union[dict, None]:
"""Get the P910 value from Wikidata for the given page."""
# Construct the Wikidata API URL
wikidata_api_url = 'https://www.wikidata.org/w/api.php'
params = {
'action': 'wbgetentities',
'sites': 'kuwiki',
'titles': title,
'props': 'claims|sitelinks',
'format': 'json'
}
# Make the API request
try:
response = requests.get(wikidata_api_url, params=params)
response.raise_for_status() # Raise an exception for bad responses
except requests.exceptions.RequestException as e:
print(f"Error fetching data from Wikidata: {e}")
return None
data = response.json()
# Check if the response contains the item ID
entities = data.get('entities')
if not entities:
return None
# Extract the item ID
item_id = next(iter(entities))
item_data = entities[item_id]
# Check if the item has the P910 property
claims = item_data.get('claims', {})
P910_claims = claims.get('P910', [])
if not P910_claims:
return None
# Get the target value from the claim
P910_claim = P910_claims[0]
mainsnak = P910_claim.get('mainsnak', {})
datavalue = mainsnak.get('datavalue', {})
value = datavalue.get('value', {})
target_id = value.get('id')
sitelinks = item_data.get('sitelinks', {})
enwiki_page_dict = sitelinks.get('enwiki', None)
# Check if enwiki_page_dict is None
if enwiki_page_dict is None:
return None
enwiki_page = enwiki_page_dict.get('title', None)
if target_id and enwiki_page:
if VERBOSE:
print(f"QID main_Cat: {target_id}")
print(f"enwiki_page for current page: {enwiki_page}")
retr_links = mytools.get_sitelinks_qid(target_id, ['ku', 'en'])
kuwiki_main = retr_links.get('kuwiki')
enwiki_main = retr_links.get('enwiki')
result = {}
if kuwiki_main:
result["kuwiki"] = kuwiki_main
return result
else:
if enwiki_main:
if enwiki_main.replace('Category:', '') == enwiki_page:
result["enwiki"] = enwiki_main
return result
else:
return None
else:
return None
def create_main(self, page, enwiki_page):
new_cat_title = 'Kategorî:' + page.title()
new_cat_page = pywikibot.Page(self.site, new_cat_title)
if new_cat_page.exists():
if VERBOSE:
print('Kategorî jixwe heye. Dev jê berde.')
return None
page_text = '{{subst:bêkategorî}}\n{{standard-kat}}'
page_text += f'\n\n[[en:{enwiki_page}]]'
new_cat_page.text = page_text
summary = f'[[User:Balyozxane/skrîpt/py/kuCosmeticsCore.py|Bot]]: Wekheva [[en:{enwiki_page}]] hat çêkirin'
if not TESTING:
new_cat_page.save(summary=summary)
return pywikibot.Category(self.site, new_cat_title, sort_key=' ')
def fixSectionTitles(self, text: str) -> str:
if self.namespace != 0 and not TESTING:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
replacements = {
r'==\s*[gG]ir[eê]dan[aeêîi]n?\s+[Dd]erv(a|eyî|ê|e)\s*==': '== Girêdanên derve ==',
r'==\s*Erdn[îi]garî?\s*==': '== Erdnîgarî ==',
r'==\s*[Çç]ava?kanî\s*==': '== Çavkanî ==',
r'==\s*[Tt]ûrîzm\s*==': '== Turîzm ==',
r'==\s*[êeÊe]t[iî]m[ao]lo[gj]î\s*==': '== Etîmolojî ==',
r'==\s*[Dd][iî]rok\s*==': '== Dîrok ==',
r'==\s*[bB]in[eê]r[eê] [Jj]î\s*==': '== Binêre herwiha =='
}
for pattern, replacement in replacements.items():
text = re.sub(pattern, replacement, text)
return text
def fixNowiki(self, text: str) -> str:
if self.namespace != 0 and not TESTING:
return text
        text = re.sub(r'\[\[([^]]+)]]<nowiki/>', r'[[\1]]', text)
return text
def fixPunctAfterTempl(self, text: str) -> str:
"""
        Move a punctuation mark that immediately follows one of the listed
        templates to just before the template.
:param text: The input wiki text.
:return: The modified wiki text.
"""
if self.namespace != 0 and not TESTING:
return text
# Define punctuation marks
punctuation_marks = [",", ".", ":", ";", "!", "?"]
template_names = ['Çavkanî hewce ye', 'Ne kurdî-biçûk', 'Zelalkirin hewce ye']
# Iterate over template names
for template_name in template_names:
# Define the pattern to match the template followed by punctuation
pattern = rf'\s*{{{{\s*{template_name}([^}}]+)?}}}}([{"".join(punctuation_marks)}])'
# Define the replacement pattern
replacement = f'\\2{{{{{template_name}\\1}}}}'
# Perform the replacement using textlib
text = textlib.replaceExcept(text, pattern, replacement, ['table'])
return text
def removeSelfCat(self, text: str) -> str:
if self.namespace != 14 and not TESTING:
return text
category_links = textlib.getCategoryLinks(text, site=self.site)
# Construct new category links without self.title while preserving sortkeys
new_category_links = []
for category in category_links:
if category.title() != self.title:
sortkey = category.sortKey
if sortkey:
new_category_links.append(f"{category.title()}|{sortkey}")
else:
new_category_links.append(category.title())
# Replace existing categories with new category links
text = textlib.replaceCategoryLinks(text, new_category_links, site=self.site)
return text
def removeDupeCats(self, text: str) -> str:
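        # Keep a single copy of each category link. When the same category appears
        # more than once, the copy that carries a sortkey wins over one without,
        # e.g. [[Kategorî:Dîrok]] … [[Kategorî:Dîrok|*]] -> only [[Kategorî:Dîrok|*]] is kept.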
# Extract categories
categories = textlib.getCategoryLinks(text, self.site)
seen_categories = {}
final_categories = []
# Iterate through categories
for category in categories:
cat_title = category.title()
cat_sortkey = category.sortKey
if cat_title not in seen_categories:
# Record the first occurrence of the category
seen_categories[cat_title] = cat_sortkey
final_categories.append(category)
else:
# Handle duplicate categories
first_sortkey = seen_categories[cat_title]
if not first_sortkey and not cat_sortkey:
# Skip the current category as it is a duplicate without a sortkey
continue
# If the current category has a sortkey, we keep it and replace the first occurrence
# if the first occurrence does not have a sortkey
if not first_sortkey and cat_sortkey:
# Replace the first occurrence with the current one
final_categories = [cat for cat in final_categories if cat.title() != cat_title]
final_categories.append(category)
# Update the seen_categories with the new sortkey
seen_categories[cat_title] = cat_sortkey
# Replace the categories in the text
text = textlib.replaceCategoryLinks(text, final_categories, site=self.site)
return text
def removeDupeParam(self, text: str) -> str:
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
params_seen = set()
for param in template.params:
param_name = str(param.name).strip()
if param_name in params_seen and (not param.value.strip()): # Check for empty values
template.remove(param)
else:
params_seen.add(param_name)
text = str(wikicode)
return text
def replaceDeprecatedParams(self, text: str) -> str:
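        # parambikejson.json maps a template name to {old parameter: new parameter},
        # roughly like this (illustrative names only):
        #   {"Agahîdank kes": {"birth_date": "dîroka jidayikbûnê"}}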
with open('parambikejson.json', encoding='utf-8') as f:
alias_dict = json.load(f)
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = ucfirst(template.name)
# Check if the capitalized template name is in alias_dict
if template_name in alias_dict:
params_to_replace = alias_dict[template_name]
# Loop through each parameter in the template
for param in template.params:
param_name = param.name.strip()
# Check if the parameter name needs replacing
if param_name in params_to_replace:
new_param_name = params_to_replace[param_name]
param.name = new_param_name
text = str(wikicode)
return text
def fixAgahidankSpace(self, text: str) -> str:
if self.namespace != 0 and not TESTING:
return text
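        # Reformat infobox ({{Agahîdank …}}) calls so that each parameter sits on its
        # own line with the parameter name padded to a fixed width, roughly (sketch):
        #   {{Agahîdank kes
        #   | nav                = Ehmedê Xanî
        #   | sal                = 1650
        #   }}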
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = ucfirst(template.name)
if template_name.startswith("Agahîdank"):
if template.params:
# Iterate over the parameters and format them
for param in template.params:
# Calculate space padding based on the length of the parameter name
param_name_length = len(param.name.strip())
if param_name_length <= 17:
space_padding = " " * (18 - param_name_length) + " "
else:
space_padding = " "
# Add a line break after each parameter value
param.value = " " + param.value.strip() + "\n"
# Update parameter name with padding
param.name = " {}{}".format(param.name.strip(), space_padding)
# Add a line break after the template name
template.name = template.name.strip() + "\n"
else:
# Handle the case where there are no parameters in the template
pass
return str(wikicode)
def removeLtrMark(self, text: str) -> str:
"""
Removes all occurrences of the Left-to-Right Mark (U+200E) from the input string.
:param text: The string to process.
:return: A new string with all LTR marks removed.
"""
return text.replace('\u200e', '')
def replaceDeprecatedTemplates(self, text: str) -> str:
"""
Renames redirected templates from redirected_template_mappings.json for kuwiki
"""
# Load JSON file containing template name mappings
with open('redirected_template_mappings.json', encoding='utf-8') as f:
template_mappings = json.load(f)
wikicode = mwparserfromhell.parse(text)
# Iterate over each template in the parsed text
for template in wikicode.filter_templates():
old_name = ucfirst(template.name)
# Check if the template name exists in the JSON mappings
if old_name in template_mappings:
new_name = template_mappings[old_name]["rd_title"] # Get the new template name
# Find the position of the old template name in template.name
start_index = template.name.lower().find(old_name.lower())
# Replace the old template name with the new one in template.name
template.name = (
template.name[:start_index] + new_name + template.name[start_index + len(old_name):]
)
return str(wikicode)
def assignNamesToUnnamedRefs(self, text: str) -> str:
"""
This function assigns names to unnamed <ref> tags by checking for duplicate ref values
and giving them names like :0, :1, etc.
Parameters:
text (str): The input wikitext containing <ref> tags.
Returns:
str: The modified wikitext with named <ref> tags.
"""
parsed = mwparserfromhell.parse(text)
# tags = list(parsed.ifilter(forcetype=mwparserfromhell.wikicode.Tag, matches="<\\s*ref\\s*", recursive=True))
tags = list()
for tag in parsed.filter_tags(recursive=True):
if tag.tag == "ref":
tags.append(tag)
# Dictionary to keep track of ref contents and their assigned names
ref_contents_to_names = {}
# Counter for generating unique names
counter = 0
# Dictionary to track counts of ref contents
ref_content_counts = {}
# Set to track existing ref names
existing_names = set()
i = 1
# Collect existing ref names and map ref contents to names
for tag in tags:
#print(f"-----\ntag {i}:\n{str(tag)}\n------")
i += 1
if tag.has("name"):
name = tag.get("name").value.strip()
ref_content = str(tag.contents).strip()
existing_names.add(name)
ref_contents_to_names[ref_content] = name
# First pass: Identify unnamed refs and initialize ref_content_counts
for tag in tags:
if not tag.has("name"):
ref_content = str(tag.contents).strip()
if ref_content in ref_content_counts:
ref_content_counts[ref_content] += 1
else:
ref_content_counts[ref_content] = 1
# Set to track if a ref content has been named for the first time
first_occurrence_named = set()
# Second pass: Rename tags and handle duplicates
for tag in tags:
if not tag.has("name"):
ref_content = str(tag.contents).strip()
if ref_content in ref_contents_to_names:
# If the content matches a named reference, use the existing name
new_tag = mwparserfromhell.nodes.tag.Tag("ref", self_closing=True)
new_tag.add("name", ref_contents_to_names[ref_content])
parsed.replace(tag, new_tag)
else:
if ref_content_counts[ref_content] > 1:
if ref_content not in first_occurrence_named:
# Assign a unique name to the first occurrence of the duplicate content
while f":{counter}" in existing_names:
counter += 1
new_name = f":{counter}"
ref_contents_to_names[ref_content] = new_name
tag.add("name", new_name)
first_occurrence_named.add(ref_content)
existing_names.add(new_name)
else:
# For subsequent occurrences, create a self-closing ref tag with the assigned name
new_tag = mwparserfromhell.nodes.tag.Tag("ref", self_closing=True)
new_tag.add("name", ref_contents_to_names[ref_content])
parsed.replace(tag, new_tag)
else:
# If it's a unique unnamed ref, we do nothing
pass
return str(parsed)
def fixVrefNames(self, text: str) -> str:
"""
taken from [[:en:User:Qwerfjkl/VEref.py]] which is itself taken
from [[:en:User:Psiĥedelisto/VisualEditor ref namer.py]]
        The VisualEditor (very annoyingly!) doesn't name references added by users and gives them names like :0, :1, etc. This method fixes that automatically.
Changes some lower case template names to upper and vice versa
"""
if self.namespace != 0 and not TESTING:
return text
parsed = mwparserfromhell.parse(text)
tags = list(filter(None, [t if t.has("name") else None for t in
parsed.ifilter(forcetype=mwparserfromhell.wikicode.Tag, matches="<\\s*ref\\s*",
recursive=True)]))
        # list of existing ref names; compared against so we don't create the same name twice
ref_names = [tag.get("name").value for tag in tags]
# find the list of numbered ref tags
refs = list(
            filter(lambda s: re.search(r"^:\d+$", str(s.get("name").value)) and not re.search(r"/>$", str(s)), tags))
pretty = dict()
for ref in refs:
template = ref.contents.get(0)
if not isinstance(template, mwparserfromhell.nodes.Template): # Check if template is a Template object
continue
if template.has("vauthors"):
v = str(template.get("vauthors").value)
elif template.has("authors"):
v = str(template.get("authors").value)
elif template.has("paşnav"):
v = str(template.get("paşnav").value)
elif template.has("pêşnav"):
v = str(template.get("pêşnav").value)
else:
continue
v = v.strip()
if "," in v:
last = v[:v.index(",")]
elif " " in v:
last = v[:v.index(" ")]
else:
last = v
punctuation = set(string.punctuation)
# Strip punctuation characters from the last word directly
last = ''.join([char for char in last if char not in punctuation])
if re.match(r'^[0-9\-.,]+$', last):
last = False
else:
# Check if the last name contains Latin alphabet characters
latin_alphabet = set(string.ascii_letters)
if not any(char in latin_alphabet for char in last):
last = False
date = False
if template.has("tarîx"):
date = str(template.get("tarîx").value)
elif template.has("dîrok"):
date = str(template.get("dîrok").value)
elif template.has("sal"):
date = str(template.get("sal").value)
if date and last:
                match = re.search(r'\d{4}', date)
if match:
date = match[0]
new_name = "{}{}".format(last, date)
# Ensure there are no duplicate values in pretty and new_name does not exist in the current text using ref_names
if new_name not in pretty.values() and new_name not in ref_names:
pretty[str(ref.get("name").value)] = new_name
if not pretty:
return text
if VERBOSE:
print("pretty:", pretty)
for tag in parsed.ifilter(forcetype=mwparserfromhell.wikicode.Tag, matches="<\\s*ref\\s*", recursive=True):
if not tag.has("name"):
continue
k = str(tag.get("name").value)
if k in pretty:
tag.attributes[0].value = pretty[k]
return str(parsed)
def fixSelfInterwiki(self, text: str) -> str:
"""
Interwiki links to the site itself are displayed like local links.
Remove their language code prefix.
"""
if not self.talkpage and pywikibot.calledModuleName() != 'interwiki':
interwikiR = re.compile(r'\[\[(?: *:)? *{} *: *([^\[\]\n]*)]]'
.format(self.site.code))
text = interwikiR.sub(r'[[\1]]', text)
return text
def fixMainCat(self, text: str) -> str:
"""
Retrieve the main category from wikidata or create it if need be
"""
assert self.title is not None
if self.namespace != 0 and not TESTING:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
categories = textlib.getCategoryLinks(text, site=self.site)
new_text = text
if categories:
main = pywikibot.Category(self.site, 'Category:' + self.title,
sort_key=' ')
if main in categories:
return text
# Get main categories from Wikidata
maincats = self.get_main_cat(self.title)
if maincats:
if VERBOSE:
print(f"maincats: {maincats}")
kuwiki_link = maincats.get('kuwiki')
enwiki_link = maincats.get('enwiki')
if kuwiki_link:
if VERBOSE:
print(f"kuwiki Main cat found: {kuwiki_link}")
main = pywikibot.Category(self.site, kuwiki_link, sort_key=' ')
if main in categories:
categories.pop(categories.index(main))
categories.insert(0, main)
new_text = textlib.replaceCategoryLinks(text, categories,
site=self.site)
if ''.join(text.split()) != ''.join(new_text.split()):
return new_text
else:
return text
def standardizeFooterTemplates(self, text: str) -> str:
if self.namespace != 0 and not TESTING:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
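        # Collect stub (şitil) templates, {{DEFAULTSORT}}, authority-control and
        # {{Koord|…|display=title}} templates from anywhere in the page, remove them,
        # and re-append them near the bottom of the page in a standard order.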
# Find and remove other templates
sitil_template_regex = r'{{\s*([^\}]+\-şitil|[Şş]iti?l|[Kk]urt|[Ss]tub|[Şş]itlek|[^\}]+\-şitil\-[^\}]+)\s*}}'
sitil_templates = re.findall(sitil_template_regex, text)
text = re.sub(sitil_template_regex, '', text)
template_sitil_regex = r'{{\s*([Şş]itil-[^\}]+)\s*}}'
template_sitil = re.findall(template_sitil_regex, text)
text = re.sub(template_sitil_regex, '', text)
# Find and remove DEFAULTSORT
defaultsort_regex = r'{{\s*(DEFAULTSORT:[^}]+|Salê kat bike heke sal hebe)\s*}}'
defaultsort = re.findall(defaultsort_regex, text)
defaultsort = defaultsort[0] if defaultsort else ""
text = re.sub(defaultsort_regex, '', text)
kontrol_oto_regex = r'\{\{\s*([kK]ontrola otorîtey?ê?|[aA]uthority control|Kontrola otorîte)\s*}}'
kontrol_oto_templ = re.findall(kontrol_oto_regex, text)
text = re.sub(kontrol_oto_regex, '', text)
# Find and remove koord display=title template
koord_regex = r'{{\s*([Kk]oord|[Cc]oord)\s*\|\s*([^}]+display\s*=\s*title)\s*}}'
koord_templates = [match[1] for match in re.findall(koord_regex, text)]
text = re.sub(koord_regex, '', text)
updated_text = text
if len(kontrol_oto_templ) > 0:
updated_text = textlib.add_text(updated_text, "\n{{Kontrola otorîteyê}}", site=self.site)
if len(template_sitil) > 0:
add_template_sitil = '\n'.join('{{' + template_s + '}}' for template_s in template_sitil)
updated_text = textlib.add_text(updated_text, add_template_sitil, site=self.site)
if len(sitil_templates) > 0:
add_sitil_templates = '\n'.join('{{' + template + '}}' for template in sitil_templates)
updated_text = textlib.add_text(updated_text, add_sitil_templates, site=self.site)
if len(koord_templates) > 0:
add_koord_templates = '\n'.join('{{Koord|' + koord_template + '}}' for koord_template in koord_templates)
updated_text = textlib.add_text(updated_text, add_koord_templates, site=self.site)
if len(defaultsort) > 0:
add_defaultsort = '\n' + '{{' + defaultsort + '}}'
updated_text = textlib.add_text(updated_text, add_defaultsort, site=self.site)
# Remove empty lines at the end of the page
updated_text = re.sub(r'\n\n+', '\n\n', updated_text)
return updated_text
def standardizePageFooter(self, text: str) -> str:
"""
Standardize page footer.
Makes sure that interwiki links and categories are put
into the correct position and into the right order. This
combines the old instances of standardizeInterwiki
and standardizeCategories.
The page footer consists of the following parts
in that sequence:
1. categories
2. additional information depending on the local site policy
3. interwiki
"""
assert self.title is not None
categories = []
interwiki_links = {}
# get categories
if not self.template:
categories = textlib.getCategoryLinks(text, site=self.site)
subpage = False
if not self.talkpage:
if self.template:
try:
tmpl, loc = moved_links[self.site.code]
del tmpl
except KeyError:
loc = None
if loc is not None and loc in self.title:
subpage = True
# get interwiki
interwiki_links = textlib.getLanguageLinks(
text, insite=self.site, template_subpage=subpage)
# remove interwiki
text = textlib.removeLanguageLinks(text, site=self.site)
if self.namespace == 0 or TESTING:
text = self.standardizeFooterTemplates(text)
# add categories, main to top
if categories:
main = pywikibot.Category(self.site, 'Category:' + self.title,
sort_key=' ')
if main in categories:
categories.pop(categories.index(main))
categories.insert(0, main)
# Sort categories in alphabetic order
def kurdish_sort_key(category):
# Assign each character in the category name its index in the Kurdish alphabet
kurdish_alphabet = "abccçdeêfghiîjklmnopqrsştuûvwxyzABCCÇDEÊFGHIÎJKLMNOPQRSŞTUÛVWXYZ"
category_title = category.title()
return tuple(
kurdish_alphabet.index(c) if c in kurdish_alphabet else float('inf') for c in category_title)
categories.sort(key=kurdish_sort_key)
text = textlib.replaceCategoryLinks(text, categories,
site=self.site)
# add interwiki
if interwiki_links:
text = textlib.replaceLanguageLinks(text, interwiki_links,
site=self.site,
template=self.template,
template_subpage=subpage)
return text
def translateAndCapitalizeNamespaces(self, text: str) -> str:
"""Use localized namespace names.
.. versionchanged:: 7.4
No longer expect a specific namespace alias for File:
"""
# wiki links aren't parsed here.
exceptions = ['nowiki', 'comment', 'math', 'pre']
for namespace in self.site.namespaces.values():
if namespace == 0:
# skip main (article) namespace
continue
# a clone is needed. Won't change the namespace dict
namespaces = list(namespace)
# final namespace variant
final_ns = namespaces.pop(0)
if namespace in (2, 3):
# skip localized user namespace, maybe gender is used
namespaces = ['User' if namespace == 2 else 'User talk']
# lowerspaced and underscored namespaces
for i, item in enumerate(namespaces):
item = item.replace(' ', '[ _]')
item = f'[{item[0]}{item[0].lower()}]' + item[1:]
namespaces[i] = item
namespaces.append(first_lower(final_ns))
if final_ns and namespaces:
text = textlib.replaceExcept(
text,
r'\[\[\s*({}) *:(?P<nameAndLabel>.*?)\]\]'
.format('|'.join(namespaces)),
fr'[[{final_ns}:\g<nameAndLabel>]]',
exceptions)
return text
def translateMagicWords(self, text: str) -> str:
"""Use localized magic words."""
def init_cache() -> None:
for magicword in ('img_thumbnail', 'img_left', 'img_center',
'img_right', 'img_none', 'img_framed',
'img_frameless', 'img_border', 'img_upright',
'img_baseline', 'img_sub', 'img_super',
'img_top', 'img_text_top', 'img_middle',
'img_bottom', 'img_text_bottom'):
aliases = self.site.getmagicwords(magicword)
if len(aliases) > 1:
cache.update((alias, aliases[0]) for alias in aliases[1:]
if '$1' not in alias)
if not cache:
cache[False] = True # signal there is nothing to replace
def replace_magicword(match: Match[str]) -> str:
if cache.get(False):
return match.group()
split = match.group().split('|')
if len(split) == 1:
return match.group()
if not cache:
init_cache()
# push ']]' out and re-add below
split[-1] = split[-1][:-2]
return '{}|{}]]'.format(
split[0], '|'.join(cache.get(x.strip(), x) for x in split[1:]))
cache: Dict[Union[bool, str], Any] = {}
exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
regex = re.compile(
FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
flags=re.X)
return textlib.replaceExcept(
text, regex, replace_magicword, exceptions)
def cleanUpLinks(self, text: str) -> str:
"""Tidy up wikilinks found in a string.
This function will:
* Replace underscores with spaces
* Move leading and trailing spaces out of the wikilink and into the
surrounding text
* Convert URL-encoded characters into Unicode-encoded characters
* Move trailing characters out of the link and make the link without
using a pipe, if possible
* Capitalize the article title of the link, if appropriate
.. versionchanged:: 8.4
Convert URL-encoded characters if a link is an interwiki link
or different from main namespace.
:param text: string to perform the clean-up on
:return: text with tidied wikilinks
"""
# helper function which works on one link and either returns it
# unmodified, or returns a replacement.
def handleOneLink(match: Match[str]) -> str:
# Convert URL-encoded characters to str
titleWithSection = url2string(match['titleWithSection'],
encodings=self.site.encodings())
label = match['label']
trailingChars = match['linktrail']
newline = match['newline']
# entire link but convert URL-encoded text
oldlink = url2string(match.group(),
encodings=self.site.encodings())
is_interwiki = self.site.isInterwikiLink(titleWithSection)
if is_interwiki:
return oldlink
# The link looks like this:
# [[page_title|link_text]]trailing_chars
# We only work on namespace 0 because pipes and linktrails work
# differently for images and categories.
page = pywikibot.Page(pywikibot.Link(titleWithSection, self.site))
try:
in_main_namespace = page.namespace() == 0
except InvalidTitleError:
in_main_namespace = False
if not in_main_namespace:
return oldlink
# Replace underlines by spaces, also multiple underlines
titleWithSection = re.sub('_+', ' ', titleWithSection)
# Remove double spaces
titleWithSection = re.sub(' +', ' ', titleWithSection)
# Remove unnecessary leading spaces from title,
# but remember if we did this because we eventually want
# to re-add it outside of the link later.
titleLength = len(titleWithSection)
titleWithSection = titleWithSection.lstrip()
hadLeadingSpaces = len(titleWithSection) != titleLength
hadTrailingSpaces = False
# Remove unnecessary trailing spaces from title,
# but remember if we did this because it may affect
# the linktrail and because we eventually want to
# re-add it outside of the link later.
if not trailingChars:
titleLength = len(titleWithSection)
titleWithSection = titleWithSection.rstrip()
hadTrailingSpaces = len(titleWithSection) != titleLength
if not titleWithSection:
# just skip empty links.
return match.group()
# Remove unnecessary initial and final spaces from label.
# Please note that some editors prefer spaces around pipes.
# (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
if label is not None:
# Remove unnecessary leading spaces from label,
# but remember if we did this because we want
# to re-add it outside of the link later.
labelLength = len(label)
label = label.lstrip()
hadLeadingSpaces = len(label) != labelLength
# Remove unnecessary trailing spaces from label,
# but remember if we did this because it affects
# the linktrail.
if not trailingChars:
labelLength = len(label)
label = label.rstrip()
hadTrailingSpaces = len(label) != labelLength
else:
label = titleWithSection
if trailingChars:
label += trailingChars
if self.site.siteinfo['case'] == 'first-letter':
firstcase_title = first_lower(titleWithSection)
firstcase_label = first_lower(label)
else:
firstcase_title = titleWithSection
firstcase_label = label
if firstcase_label == firstcase_title:
newLink = f'[[{label}]]'
# Check if we can create a link with trailing characters
# instead of a pipelink
elif (firstcase_label.startswith(firstcase_title)
and trailR.sub('', label[len(titleWithSection):]) == ''):
newLink = '[[{}]]{}'.format(label[:len(titleWithSection)],
label[len(titleWithSection):])
else:
# Try to capitalize the first letter of the title.
# Not useful for languages that don't capitalize nouns.
# TODO: Add a configuration variable for each site,
# which determines if the link target is written in
# uppercase
if self.site.sitename == 'wikipedia:de':
titleWithSection = first_upper(titleWithSection)
newLink = f'[[{titleWithSection}|{label}]]'
# re-add spaces that were pulled out of the link.
# Examples:
# text[[ title ]]text -> text [[title]] text
# text[[ title | name ]]text -> text [[title|name]] text
# text[[ title |name]]text -> text[[title|name]]text
# text[[title| name]]text -> text [[title|name]]text
if hadLeadingSpaces and not newline:
newLink = ' ' + newLink
if hadTrailingSpaces:
newLink += ' '
if newline:
newLink = newline + newLink
return newLink
trailR = re.compile(self.site.linktrail())
# The regular expression which finds links. Results consist of four groups:
# group <newline> depends whether the links starts with a new line.
# group <titleWithSection> is the page title and section, that is,
# everything before | or ]. It'll include the # to make life easier for us.
# group <label> is the alternative link title between | and ].
# group <linktrail> is the link trail after ]] which are part of the word.
# note that the definition of 'letter' varies from language to language.
linkR = re.compile(
r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
r'(\|(?P<label>[^]|]*))?]](?P<linktrail>'
+ self.site.linktrail() + ')')
text = textlib.replaceExcept(text, linkR, handleOneLink,
['comment', 'math', 'nowiki', 'pre',
'startspace'])
return text
def resolveHtmlEntities(self, text: str) -> str:
"""Replace HTML entities with string."""
ignore = [
            38,  # Ampersand (&amp;)
            39,  # Single quotation mark (&#39;) per T26093
            60,  # Less than (&lt;)
            62,  # Greater than (&gt;)
            91,  # Opening square bracket ([)
                 # - sometimes used intentionally inside links
            93,  # Closing square bracket (])
                 # - used intentionally inside links
            124,  # Vertical bar (|)
                  # - used intentionally in navigation bar templates on w:de
            160,  # Non-breaking space (&nbsp;)
                  # - not supported by Firefox textareas
            173,  # Soft-hyphen (&shy;) - enable editing
            8206,  # Left-to-right mark (&lrm;)
            8207,  # Right-to-left mark (&rlm;)
]
if self.template:
ignore.append(32) # Space ( )
ignore.append(58) # Colon (:)
# TODO: T254350 - what other extension tags should be avoided?
# (graph, math, score, timeline, etc.)
text = pywikibot.html2unicode(
text, ignore=ignore, exceptions=['comment', 'syntaxhighlight'])
return text
def removeUselessSpaces(self, text: str) -> str:
"""Cleanup multiple or trailing spaces."""
exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
'startspace', 'table', 'template', 'timeline']
text = textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1',
exceptions, site=self.site)
text = textlib.replaceExcept(text, r'\n\n\n*', r'\n\n',
exceptions, site=self.site)
# text = textlib.replaceExcept(text, r'\n +', r'\n',
# exceptions, site=self.site)
return text
def removeNonBreakingSpaceBeforePercent(self, text: str) -> str:
"""
Remove a non-breaking space between number and percent sign.
Newer MediaWiki versions automatically place a non-breaking space in
front of a percent sign, so it is no longer required to place it
manually.
"""
text = textlib.replaceExcept(
text, r'(\d)&(?:nbsp|#160|#x[Aa]0);%', r'\1 %', ['timeline'])
return text
def cleanUpSectionHeaders(self, text: str) -> str:
"""
Add a space between the equal signs and the section title.
Example::
==Section title==
becomes::
== Section title ==
.. note:: This space is recommended in the syntax help on the
English and German Wikipedias. It is not wanted on Lojban and
English Wiktionaries (:phab:`T168399`, :phab:`T169064`) and
it might be that it is not wanted on other wikis. If there
are any complaints, please file a bug report.
"""
return textlib.replaceExcept(
text,
r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])[ \t]*\1[ \t]*\r?\n',
r'\1 \g<title> \1\n',
['comment', 'math', 'nowiki', 'pre'])
def putSpacesInLists(self, text: str) -> str:
"""
Add a space between the * or # and the text.
.. note:: This space is recommended in the syntax help on the
English, German and French Wikipedias. It might be that it
is not wanted on other wikis. If there are any complaints,
please file a bug report.
"""
if not self.template:
exceptions = ['comment', 'math', 'nowiki', 'pre',
'syntaxhighlight', 'template', 'timeline',
self.site.redirect_regex]
text = textlib.replaceExcept(
text,
r'(?m)'
r'^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
r'\g<bullet> \g<char>',
exceptions)
return text
# from fixes.py
def fixSyntaxSave(self, text: str) -> str:
"""Convert weblinks to wikilink, fix link syntax.
"""
def replace_link(match: Match[str]) -> str:
"""Create a string to replace a single link."""
replacement = '[['
if re.match(r'(?:{}):'
.format('|'.join((*self.site.namespaces[6],
*self.site.namespaces[14]))),
match['link']):
replacement += ':'
link = match['link']
if link.endswith('/'):
print('url ends with /')
link = re.sub('/$', '', match['link'])
replacement += link
if match['title']:
replacement += '|' + match['title']
return replacement + ']]'
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
# link to the wiki working on
# Only use suffixes for article paths
for suffix in self.site._interwiki_urls(True):
http_url = self.site.base_url(suffix, 'http')
if self.site.protocol() == 'http':
https_url = None
else:
https_url = self.site.base_url(suffix, 'https')
# compare strings without the protocol, if they are empty support
# also no prefix (//en.wikipedia.org/…)
http = urlparse(http_url)
https = urlparse(https_url)
if https_url is not None and http.netloc == https.netloc:
urls = ['(?:https?:)?'
+ re.escape(urlunparse(('', *http[1:])))]
else:
urls = [re.escape(url) for url in (http_url, https_url)
if url is not None]
for url in urls:
# unescape {} placeholder
url = url.replace(r'\{\}', '{title}')
# Only include links which don't include the separator
# as the wikilink won't support additional parameters
separator = '?&' if '?' in suffix else '?'
# Match first a non space in the title to prevent that multiple
# spaces at the end without title will be matched by it
title_regex = (r'(?P<link>[^{sep}]+?)'
r'(\s+(?P<title>[^\s].*?))'
.format(sep=separator))
url_regex = fr'\[\[?{url}?\s*\]\]?'
text = textlib.replaceExcept(
text,
url_regex.format(title=title_regex),
replace_link, exceptions, site=self.site)
# external link in/starting with double brackets
text = textlib.replaceExcept(
text,
            r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
r'[\g<url>]', exceptions, site=self.site)
# external link and description separated by a pipe, with
# whitespace in front of the pipe, so that it is clear that
# the dash is not a legitimate part of the URL.
text = textlib.replaceExcept(
text,
            r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]', exceptions)
# dash in external link, where the correct end of the URL can
# be detected from the file extension. It is very unlikely that
# this will cause mistakes.
extensions = [fr'\.{ext}'
for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']]
text = textlib.replaceExcept(
text,
            r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
r'\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]', exceptions)
return text
def fixHtml(self, text: str) -> str:
"""Relace html markups with wikitext markups."""
def replace_header(match: Match[str]) -> str:
"""Create a header string for replacing."""
depth = int(match[1])
return r'{0} {1} {0}'.format('=' * depth, match[2])
# Everything case-insensitive (?i)
# Keep in mind that MediaWiki automatically converts <br> to <br />
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
r"'''\2'''", exceptions, site=self.site)
text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
r"''\2''", exceptions, site=self.site)
# horizontal line without attributes in a single line
text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
r'\1----\2', exceptions)
# horizontal line with attributes; can't be done with wiki syntax
# so we only make it XHTML compliant
text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
r'<hr \1 />',
exceptions)
# a header where only spaces are in the same line
text = textlib.replaceExcept(
text,
r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
replace_header,
exceptions)
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
return text
def fixReferences(self, text: str) -> str:
"""Fix references tags."""
# See also
# https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
if self.namespace != 0 and not TESTING:
return text
exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
'startspace']
        # normalize spacing so it is <ref name=" rather than <ref name= " or <ref name ="
text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
# Replace <ref name=Penny1p16> with <ref name="Penny1p16">, only if not already quoted
text = textlib.replaceExcept(text,
r'(?i)<ref +name *= *([^\"\/ >]+) *>',
r'<ref name="\1">', exceptions)
# Replace <ref name=Penny1p16 /> with <ref name="Penny1p16" />, only if not already quoted
text = textlib.replaceExcept(text,
r'(?i)<ref +name *= *([^\" >]+) */>',
r'<ref name="\1"/>', exceptions)
# remove empty <ref/>-tag
text = textlib.replaceExcept(text,
r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
r'', exceptions)
text = textlib.replaceExcept(text,
r'</ref>[ ]*<ref>',
r'</ref><ref>', exceptions)
return text
def fixStyle(self, text: str) -> str:
"""Convert prettytable to wikitable class."""
if self.namespace != 0 and not TESTING:
return text
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
text = textlib.replaceExcept(text,
r'(class="[^"]*)prettytable([^"]*")',
r'\1wikitable\2', exceptions)
return text
def fixTypo(self, text: str) -> str:
"""Fix units."""
if self.namespace != 0 and not TESTING:
return text
exceptions: List[Union[str, Pattern[str]]] = [
'comment',
'gallery',
'hyperlink',
'interwiki',
'link',
'nowiki',
'math',
'pre',
'startspace',
'syntaxhighlight',
]
# change <number> ccm -> <number> cm³
text = textlib.replaceExcept(text, r'(\d)\s*(?: )?ccm',
r'\1 cm³', exceptions,
site=self.site)
# Solve wrong Nº sign with °C or °F
# additional exception requested on fr-wiki for this stuff
pattern = re.compile('«.*?»')
exceptions.append(pattern)
text = textlib.replaceExcept(text, r'(\d)\s*(?: )?[º°]([CF])',
r'\1 °\2', exceptions,
site=self.site)
text = textlib.replaceExcept(text, 'º([CF])', '°' + r'\1',
exceptions,
site=self.site)
return text
def fix_ISBN(self, text: str) -> str:
"""Hyphenate ISBN numbers."""
return _reformat_ISBNs(text, strict=self.ignore != CANCEL.MATCH)
def fixLead(self, text: str) -> str:
if self.namespace != 0 and not TESTING:
return text
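        # Standardize the order of templates in the lead: templates found there are
        # removed and re-added so that, from top to bottom, the lead reads
        # title-fixing (sernav) templates, disambiguation hatnotes, warning (hişyarde)
        # templates, cleanup templates (wrapped in {{Çend problem}} when that template
        # already existed or more than two were found), infoboxes (Agahîdank), then the prose.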
cudakirin_templates = mytools.get_cat_members(self.site, "Şablon (cudakirin)", 10)
hisyarde_templates = mytools.get_cat_members(self.site, "Şablonên hişyarde", 10)
cleanup_templates = mytools.get_cat_members(self.site, "Şablonên hişyarde ji bo gotaran", 10)
if "Çend problem" in cleanup_templates:
cleanup_templates.remove("Çend problem")
wikicode = mwparserfromhell.parse(text)
sections = wikicode.get_sections(include_lead=True)
lead_section = sections[0]
if VERBOSE:
print("lead_section:\n", lead_section)
existing_problems = None
removed_cleanup_templates = []
removed_hisyarde_templates = []
removed_agahidank_templates = []
removed_sernav_templates = []
removed_cuda_templates = []
child_agahidanks = []
def is_child_agahidank(section):
for parent_templ in section.filter_templates(recursive=True):
if ucfirst(parent_templ.name).startswith("Agahîdank"):
for param in parent_templ.params:
# Check if the parameter value contains the child template
for tpl in param.value.filter_templates():
if ucfirst(tpl.name).startswith("Agahîdank"):
child_agahidanks.append(ucfirst(tpl.name))
return child_agahidanks
child_agahidanks = is_child_agahidank(lead_section)
for template in lead_section.filter_templates():
if ucfirst(template.name) == "Çend problem":
if template.has(1):
existing_problems = str(template.get(1).value).strip()
lead_section.remove(template)
for template in lead_section.filter_templates():
template_name = ucfirst(template.name)
if template_name in cleanup_templates:
removed_cleanup_templates.append(template)
lead_section.remove(template)
if template_name in hisyarde_templates:
removed_hisyarde_templates.append(template)
lead_section.remove(template)
if template_name in self.sernav_templates:
removed_sernav_templates.append(template)
lead_section.remove(template)
if ucfirst(template.name) in cudakirin_templates:
removed_cuda_templates.append(template)
lead_section.remove(template)
if template_name.startswith("Agahîdank"):
if template_name in child_agahidanks:
continue
else:
removed_agahidank_templates.append(template)
lead_section.remove(template)
lead_section = str(lead_section).lstrip()
new_lead_section = lead_section
if removed_agahidank_templates:
readding_agahidank = "\n".join([str(template) for template in removed_agahidank_templates])
new_lead_section = readding_agahidank + "\n" + new_lead_section
readding_cleanup = ""
if existing_problems:
readding_cleanup += existing_problems.strip() + "\n"
if removed_cleanup_templates:
readding_cleanup += "\n".join([str(template) for template in removed_cleanup_templates])
if readding_cleanup.strip():
if existing_problems:
new_template = mwparserfromhell.nodes.Template("Çend problem")
new_template.add(1, "\n" + readding_cleanup + "\n")
new_lead_section = str(new_template) + "\n" + new_lead_section
elif not existing_problems and len(removed_cleanup_templates) > 2:
readding_cleanup = readding_cleanup
new_template = mwparserfromhell.nodes.Template("Çend problem")
new_template.add(1, "\n" + readding_cleanup + "\n")
new_lead_section = str(new_template) + "\n" + new_lead_section
else:
new_lead_section = readding_cleanup + "\n" + new_lead_section
if removed_hisyarde_templates:
readding_hisyarde = "\n".join([str(template) for template in removed_hisyarde_templates])
new_lead_section = readding_hisyarde + "\n" + new_lead_section
if removed_cuda_templates:
# Concatenate first_val with removed_templates
concatenated_val = "\n".join([str(template) for template in removed_cuda_templates])
new_lead_section = concatenated_val + "\n" + new_lead_section
if removed_sernav_templates:
readding_sernav = "\n".join([str(template) for template in removed_sernav_templates])
new_lead_section = readding_sernav + "\n" + new_lead_section
# Replace the lead section in the original wikicode object
wikicode.replace(sections[0], new_lead_section)
new_text = str(wikicode).lstrip()
return new_text
def addOrphanTag(self, text: str) -> str:
if self.namespace != 0:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
if self.contains_sewi_cat:
return text
if not self.is_sewi:
return text
text = text.lstrip()
text = "{{Sêwî|tarîx=" + self.tarix + "}}\n" + text
return text
def removeOrphanTag(self, text: str) -> str:
if self.namespace != 0:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
if not self.contains_sewi_cat:
return text
if self.is_sewi:
return text
text = mytools.remove_template(text, "Sêwî")
return text
def addStubTag(self, text: str) -> str:
if self.namespace != 0:
return text
if self.title.startswith("Lîste"):
return text
if self.is_liste:
return text
if self.is_disambig or self.gotara_zaravayan:
return text
if self.contains_sitil_cat:
return text
if self.is_sitil == 'lîste' or self.is_sitil is False:
return text
sitil_text = "{{Şitil}}"
text = textlib.add_text(text, sitil_text)
return text
def removeStubTag(self, text: str) -> str:
if self.namespace != 0:
return text
if not self.contains_sitil_cat:
return text
if self.title.startswith("Lîste"):
return text
if self.is_sitil == 'lîste' or self.is_sitil is True:
return text
# Find and remove other templates
template_regex = r'{{\s*([^\}]+\-şitil|[Şş]iti?l|[Kk]urt|[Ss]tub|[Şş]itlek|[^\}]+\-şitil\-[^\}]+)\s*}}'
new_text = re.sub(template_regex, '', text)
# Find and remove other templates
template_sitil_regex = r'{{\s*([Şş]itil-[^\}]+)\s*}}'
new_text = re.sub(template_sitil_regex, '', new_text)
if text != new_text:
mytools.remove_sitil_class(self.current_page)
return new_text
def addUncatTag(self, text: str) -> str:
if self.namespace not in [0, 14]:
return text
if self.namespace == 0 and (self.is_disambig or self.gotara_zaravayan):
return text
contains_bekat_templ = mytools.is_template_in_page(text, mytools.UNCAT_TEMPL)
if contains_bekat_templ or self.is_bekategori in ['idk', False, None]:
return text
print("self.is_bekategori", self.is_bekategori)
category_links = textlib.getCategoryLinks(text, site=self.site)
# ji bo categorize.py
if len(category_links) > 0:
return text
text = "{{Bêkategorî|tarîx=" + self.tarix + "}}\n" + text
return text
def removeUncatTag(self, text: str) -> str:
contains_bekat_templ = mytools.is_template_in_page(text, mytools.UNCAT_TEMPL)
if (self.namespace not in [0, 14] or
not contains_bekat_templ or
self.is_bekategori in ['idk', True, None]):
return text
text = mytools.remove_template(text, mytools.UNCAT_TEMPL)
return text
def fixApostSign(self, text: str) -> str:
if self.namespace != 0:
return text
exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
'startspace', 'table', 'ref', 'timeline']
text = textlib.replaceExcept(text, r"'", r"'", exceptions, site=self.site)
return text