Module:User:AmazingJus/af

Archived revision by AmazingJus (talk | contribs) as of 03:06, 27 November 2024.

92 of 148 tests failed. (refresh)

Text | Expected | Actual | Comments
test_hyphen:
PassedAfrikaA‧fri‧kaA‧fri‧ka
PassedAfrikaansA‧fri‧kaansA‧fri‧kaans
PassedAfrikanerA‧fri‧ka‧nerA‧fri‧ka‧ner
PassedAmerikanerA‧me‧ri‧ka‧nerA‧me‧ri‧ka‧ner
Passedasyna‧syna‧syn
Passedbelangrikbe‧lang‧rikbe‧lang‧rik
Passedbergbergberg
Passedbergeber‧geber‧ge
Passedberg-reeksberg‧reeksberg‧reeks
Passedbos-bedryfbos‧be‧dryfbos‧be‧dryf
Failedbeskoube‧skoubes‧kou
Failedbe+terbe‧terbe+‧ter
Passedbetonbe‧tonbe‧ton
Passedbetoonbe‧toonbe‧toon
PassedBothaBo‧thaBo‧tha
Passedbraaibraaibraai
Passeddokumentasiedo‧ku‧men‧ta‧siedo‧ku‧men‧ta‧sie
Passedeggoeg‧goeg‧go
Passedfestefes‧tefes‧te
Failedgeëetge‧eetgeë‧et
Passedgegeege‧geege‧gee
Passedghitaarghi‧taarghi‧taar
Passedhondjiehon‧djiehon‧djie
PassedJohannesburgJo‧han‧nes‧burgJo‧han‧nes‧burg
Passedkarretjiekar‧re‧tjiekar‧re‧tjie
Passedklu[b]klubklub
FailedMacedoniëMa‧ce‧do‧ni‧eMa‧ce‧do‧nië
Passed'n'n'n
Passedonweeron‧weeron‧weer
Failedomstandigheidom‧stan‧dig‧heidoms‧tan‧di‧gheid
FailedParaguayPa‧ra‧guayPa‧ra‧gu‧a‧y
PassedPretoriaPre‧to‧ri‧aPre‧to‧ri‧a
Passedsjokoladesjo‧ko‧la‧desjo‧ko‧la‧de
Passeds'ns'ns'n
Failedspieëlspie‧elspieël
FailedSuid-AfrikaSuid-‧A‧fri‧kaSuid‧A‧fri‧ka
Passedvanaandva‧naandva‧naand
FailedVenesiëVe‧ne‧si‧eVe‧ne‧sië
Passedvingerving‧erving‧er
Failedwîewî‧ewîe
Passedzeroze‧roze‧ro
FailedAndréAn‧dréAndré
PassedBarnardBar‧nardBar‧nard
PassedBlignautBlig‧nautBlig‧naut
PassedBlignaultBlig‧naultBlig‧nault
PassedCilliersCil‧liersCil‧liers
PassedCoetzeeCoet‧zeeCoet‧zee
PassedCoetzerCoet‧zerCoet‧zer
Passedde Villiersde Vil‧liersde Vil‧liers
Passeddu Plessisdu Ples‧sisdu Ples‧sis
Passeddu Preezdu Preezdu Preez
Passeddu Toitdu Toitdu Toit
FailedFouchéFou‧chéFouché
PassedFourieFou‧rieFou‧rie
FailedGrovéGro‧véGrové
FailedJean PierreJean PierreJe‧an Pier‧re
PassedJoubertJou‧bertJou‧bert
PassedLa.bus.chag.neLa‧bus‧chag‧neLa‧bus‧chag‧ne
FailedLa.bu.schagneLa‧bu‧schagneLa‧bu‧s‧chag‧ne
Passedle Gran.gele Gran‧gele Gran‧ge
Passedle Rouxle Rouxle Roux
PassedMalanMa‧lanMa‧lan
PassedMalherbeMal‧her‧beMal‧her‧be
PassedMaraisMa‧raisMa‧rais
PassedMeintjesMein‧tjesMein‧tjes
FailedNaudéNau‧déNaudé
PassedNortjeNor‧tjeNor‧tje
PassedPienaarPie‧naarPie‧naar
PassedSchalkSchalkSchalk
FailedTerblancheTer‧blancheTer‧blan‧che
PassedTheronThe‧ronThe‧ron
PassedViljoenVil‧joenVil‧joen
PassedVisagieVi‧sa‧gieVi‧sa‧gie
FailedViviersVi‧vi‧ersVi‧viers
Text | Expected | Actual | Comments
test_pron:
FailedAfrikaˈɑː.fri.kaafrika
FailedAfrikaansˌa.friˈkɑ̃ːs, ˌa.friˈkɑːnsafrikaans
FailedAfrikanerˌa.friˈkɑː.nərafrikaner
FailedAmerikaneraˌmɪə̯.riˈkɑː.nəramerikaner
Failedasynaˈsəɪ̯nasyn
Failedbelangrikbəˈlaŋ.rəkbe>langrik
Failedbergˈbɛrχbe>rg
Failedbergeˈbɛr.ɡəbe>rge
Failedberg-reeksˈbɛrχ.rɪə̯ksbe>rg-reeks
Failedbos-bedryfˈbɔs.bəˌdrəɪ̯fbos-be>dryf
Failedbeskoubəˈskœʊ̯be>skou
Failedbe+terˈbɪə̯.tərbe>+ter
Failedbetonbəˈtɔnbe>ton
Failedbetoonbəˈtʊə̯nbe>toon
FailedBothaˈbʊə̯.tabotha
Failedbraaibrɑːɪ̯braai
Faileddokumentasieˌdɔ.kju.mɛnˈtɑː.si, ˌdɔ.ky.mɛnˈtɑː.sidokumentasie
Failedeggoˈɛ.χueggo
Failedfesteˈfɛs.təfeste
Failedgeëetχəˈɪə̯tge>ëet
Failedgegeeχəˈχɪə̯ge>gee
Failedghitaarɡiˈtɑːrghitaar
Failedhondjieˈɦœi̯ɲ.cihondjie
FailedJohannesburgjʊə̯ˈɦa.nəsˌbœrχjohannesburg
Failedkarretjieˈka.rəi̯.cikarretjie
Failedklu[b]klab, klœbklub
FailedMacedoniëˌma.səˈdʊə̯.ni.əmacedonië
Failed'nə(n)'n
Failedonweerˈɔn.vɪə̯ronweer
Failedomstandigheidɔmˈstan.dəχˌɦəɪ̯tom>standig<heid
FailedParaguayˈpa.ra.ɡwaɪ̯paraguay
FailedPretoriaprəˈtʊə̯.ri.apretoria
Failedsjokoladeˌʃɔ.kɔˈlɑː.dəsjokolade
Faileds'nsəns'n
Failedspieëlspiːlspieël
FailedSuid-Afrikasəɪ̯tˈɑː.fri.kasuid-afrika
Failedvanaandfəˈnɑːntvanaand
FailedVenesiëvəˈniː.si.əvenesië
Failedvingerˈfəŋ.ərvinger
Failedwîeˈvəː.(ɦ)əwîe
Failedzeroˈzɪə̯.ruzero
FailedAndréˈan.drəɪ̯andré
FailedBarnardˈbar.nartbarnard
FailedBlignautˈbləχ.nœʊ̯t, ˈbli.nœʊ̯blignaut
FailedBlignaultˈbləχ.nœʊ̯t, ˈbli.nœʊ̯blignault
FailedCillierssəlˈjeə̯cilliers
FailedCoetzeekutˈseə̯coetzee
FailedCoetzerˈkut.sərcoetzer
Failedde Villiersdə.fəlˈjeə̯de villiers
Faileddu Plessisdy.pləˈsidu plessis
Faileddu Preezdəˈpreə̯du preez
Faileddu Toitdəˈtoːɪ̯du toit
FailedFouchéfuˈʃeə̯fouché
FailedFouriefuˈrifourie
FailedGrovéχruˈveə̯grové
FailedJean Pierreanˈpiːrjean pierre
FailedJoubertjuˈbæːrjoubert
FailedLa.bus.chag.nela.busˈkaχ.nəla.bus.chag.ne
FailedLa.bu.schagneˈla.bu.ʃəɪ̯nla.bu.schagne
Failedle Gran.geləˈχran.sile gran.ge
Failedle Rouxləˈruːle roux
FailedMalanmaˈlan, maˈlaŋmalan
FailedMalherbemalˈɦɛr.bəmalherbe
FailedMaraismaˈrɛːmarais
FailedMeintjesməɪ̯ɲˈcismeintjes
FailedNaudénœʊ̯ˈdeə̯naudé
FailedNortjenɔrˈkɪə̯nortje
FailedPienaarˈpi.nɑːrpienaar
FailedSchalkskalkschalk
FailedTerblanchetərˈblɑːnʃterblanche
FailedTheront(ə)ˈrontheron
FailedViljoenfəlˈjunviljoen
FailedVisagiefəˈsɑː.χi, fəˈsɑː.sivisagie
FailedViviersfə.fəˈjeə̯viviers

--[[
This module implements the template {{af-IPA}}.

Author: AmazingJus

Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]

local export = {}

local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str = require("Module:string")
local tbl = require("Module:table")

-- hand the text to Module:script utilities together with this module's language and script objects
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end

-- build a link for the term through Module:links, using this module's language and script objects
function export.link(term, face)
	local link_data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(link_data, face)
end

local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower

local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local gsplit = mw.text.gsplit
local sub = mw.ustring.sub

local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch

-- single-result wrapper around rsubn(): the parentheses keep only the first
-- return value (the substituted string) and drop the replacement count
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that string
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- list of constants
-- combining diacritics (built from their code points via Module:string/char)
local GR = u(0x0300) -- combining grave accent
local AC = u(0x0301) -- combining acute accent
local CR = u(0x0302) -- combining circumflex accent
local DR = u(0x0308) -- combining diaeresis
local accents = GR .. AC .. CR .. DR
-- letter classes, interpolated into [...] sets in the patterns below
local vowels = "aeiouyAEIOUY"
-- the uppercase half must mirror the lowercase half; "R" used to be missing,
-- so a capital R was not recognised as a consonant
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
-- characters that delimit a syllable: the hyphenation point and the word border marker
local syll_boundary = "‧#"

-- list of valid trigraphs and digraphs, including diphthongs and long vowels
-- keys are spellings; values are the phonetic strings they are replaced with
-- (the replacement step itself currently sits in the disabled part of toIPA())
local graphemes = {
	["aai"] = "ɑːɪ̯",
	["eeu"] = "iʊ̯",
	["ieu"] = "iʊ̯",
	["oei"] = "uɪ̯",
	["ooi"] = "oːɪ̯",
	["aa"] = "ɑː",
	["ae"] = "ɑː",
	["ai"] = "aɪ̯",
	["au"] = "œʊ̯",
	["ee"] = "ɪə̯",
	["ei"] = "əɪ̯",
	["eu"] = "iʊ̯",
	["ie"] = "į", -- temporary placeholder; the disabled code in toIPA() maps it to "i"
	["oe"] = "ů", -- temporary placeholder; the disabled code in toIPA() maps it to "u"
	["oi"] = "ɔɪ̯",
	["oo"] = "ʊə̯",
	["ou"] = "œʊ̯",
	["ui"] = "uɪ̯",
	["uu"] = "ü" -- temporary placeholder; the disabled code in toIPA() maps it to "y"
}
-- sort trigraphs and digraphs in descending order of length, so that
-- syllabify() brackets trigraphs before the digraphs they contain
-- NOTE(review): pairs() visits keys in an unspecified order and table.sort is
-- not stable, so the relative order of equal-length spellings is not
-- guaranteed. syllabify() re-matches digraphs inside already bracketed
-- trigraphs, so its output can depend on that order -- confirm before
-- changing the comparator or the way this list is built.
local graphemes_sorted = {}
for k, _ in pairs(graphemes) do
	table.insert(graphemes_sorted, k)
end
table.sort(graphemes_sorted, function(a, b) return len(a) > len(b) end)

-- list of various grapheme sets
-- NOTE(review): in the visible source these are only referenced from the
-- disabled part of toIPA() (vowel_length); cons_voice is not referenced at all
local sets = {
	-- single vowel letters mapped to a pair of phonetic values; the disabled
	-- code in toIPA() picks index 1 in closed syllables and index 2 in open ones
	["vowel_length"] = { -- long-short vowels
		["a"] = {"a", "ɑː"},
		["e"] = {"ɛ", "ɪə̯"},
		["i"] = {"ə", "i"},
		["o"] = {"ɔ", "ʊə̯"},
		["u"] = {"œ", "y"}
	},
	-- pairs listed as {voiced, voiceless}
	["cons_voice"] = { -- voiced/voiceless consonants
		{"b", "p"},
		{"d", "t"},
		{"ʤ", "ʧ"},
		{"ɡ", "k"},
		{"v", "f"},
		{"z", "s"},
		{"ʒ", "ʃ"},
	}
}

-- list of defined affixes
-- each entry is a table whose first element is the affix spelling; the
-- spelling is concatenated directly into patterns by components().
-- An optional `pos` field restricts the affix: apply_affix() only applies
-- such an entry when the part-of-speech check succeeds.
local affixes = {
	["pref"] = { -- prefixes
		{"aan"},
		{"agter"},
		{"be"},
		{"deur"},
		{"er"},
		{"ge"},
		{"her"},
		{"om"},
		{"ont"},
		{"onder"},
		{"van", pos = "d"}, -- the only entry with a part-of-speech restriction
		{"ver"},
		{"voor"}
	},
	["suf"] = { -- suffixes
		{"agtig"},
		{"baar"},
		{"dom"},
		{"end"},
		{"heid"},
		{"lik"},
		{"loos"},
		{"nis"},
		{"sel"},
		{"skap"},
	}
}

-- list of unstressed words
-- NOTE(review): not referenced anywhere in the visible source
local unstressed = {
	"die",
	"dit",
	"is",
	"nie",
	"'n"
}

-- list of stressed endings found in loanwords
-- stress() tries these in list order against the end of a word (ending .. "#")
-- and uses the first one that matches; "el" and "o" get extra etymology checks there
local stressed_endings = {
	"aal", "aan", "aans", "aar", "aard", "aat", "am", "ant", "at",
	"ee", "eel", "eem", "een", "eer", "ees", "eet", "ein", "ek", "el", -- "-el" only in loanwords
	"ent", "es", "et", "eur", "eus", "eut", "ieel", "ief",
	"iek", "iel", "iem", "ien", "ine", "ier", "iet", "o", -- "-o" only in french loanwords
	"oen", "on", "oof", "oog", "ooi", "ool", "oom", "oon", "oor",
	"teek", "teit", "u", "uum", "uur", "uus", "uut", "y", "yn", "ys"
}

-- list of respelling substitutions
-- each entry is {pattern, replacement, etymology}:
--   * etymology "-" marks a default rule; any other code ("fr", "en", "af")
--     marks a rule that generate_subs() selects only for that etymology
--   * generate_subs() walks the list in order and keeps only the first
--     selected entry per pattern, so an etymology-specific rule must come
--     before the default rule that shares its pattern
-- "#" is the word border marker and "‧" the syllable boundary
-- NOTE(review): the only consumer, generate_subs(), is currently called from
-- the disabled part of toIPA() only
local subs = {
	-- 'N
	{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
	{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise

	-- CH
	{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
	{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
	{"ch([" .. cons .. "]?[ei])", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
	{"ch", "k", "-"}, -- otherwise /k/

	-- NG
	{"ng", "ŋ", "-"}, -- pronounced /ŋ/

	-- SH/SJ
	{"s[hj]", "ʃ", "-"}, -- pronounced /ʃ/

	-- DJ/TJ
	{"[dt]jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
	{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
	{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/

	-- C
	{"c([ei])", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
	{"c", "k", "-"}, -- otherwise /k/

	-- GH
	{"gh", "ɡ", "-"}, -- pronounced /ɡ/

	-- G
	{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
	{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
	{"g", "χ", "-"}, -- otherwise /χ/
	{"n(‧?[kɡ])", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/

	-- V
	{"v", "f", "af"}, -- pronounced /f/ in native words

	-- W
	{"w", "w", "en"}, -- pronounced /w/ in english loans
	{"w", "v", "-"}, -- otherwise /v/

	-- EAU
	{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans

	-- OI
	{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans

	-- IJ
	{"ij([^" .. vowels .. "])", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names

	-- X
	{"#x", "#s", "-"}, -- pronounced /s/ word-initially
	{"x", "ks", "-"}, -- otherwise /ks/

	-- H
	{"([" .. cons .. vowels .. "])h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
	{"h", "ɦ", "-"}, -- otherwise /ɦ/

	-- O
	{"o([" .. syll_boundary .. "])", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
	{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position

	-- U
	{"u([" .. cons .. "])", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
	{"u", "jů", "en"}, -- otherwise /ju/ in english loans

	-- Y
	{"y", "j", "en"}, -- pronounced /j/ in english loans
	{"y", "EI", "-"}, -- otherwise /əɪ̯/

	-- circumflex accent
	{CR, "ː", "-"} -- lengthens a vowel with its short quality
}

-- canonicalisation function: normalises the raw input and returns it as an array of words
local function canonicalise(text)
	-- decompose accents first, then fold everything to lowercase
	local normalised = lower(decomp(text))

	-- squeeze every run of whitespace to one space, then trim both ends
	normalised = rsub(normalised, "%s+", " ")
	normalised = rsub(rsub(normalised, "^ ", ""), " $", "")

	-- a comma (with any surrounding whitespace) becomes a standalone "|" pause token
	normalised = rsub_repeatedly(normalised, "%s*,%s*", " | ")

	-- one array element per space-separated word
	return split(normalised, " ")
end

-- only apply relevant affixes
-- Rewrites `word` with `replacement` wherever `pattern` matches, provided the
-- affix is allowed for the given part of speech.
--   word        : the text to mark
--   affix       : an entry of `affixes` (spelling at [1], optional `pos` field)
--   pos         : part of speech; nil or ".*" (the value components() falls
--                 back to when none was supplied) means "any"
--   pattern     : pattern locating the affix in `word`
--   replacement : replacement text for the match
-- Returns the rewritten word, or `word` unchanged when the affix does not apply.
local function apply_affix(word, affix, pos, pattern, replacement)
	-- An affix qualifies when it carries no restriction, when no specific part
	-- of speech was requested, or when the requested one contains its code.
	-- The wildcard has to be tested explicitly: find(".*", affix.pos) looks for
	-- the code inside the two-character string ".*" and therefore never
	-- matched, so restricted affixes were silently skipped.
	local pos_allows = not affix.pos
		or pos == nil
		or pos == ".*"
		or find(pos, affix.pos)

	if pos_allows and find(word, pattern) then
		return rsub(word, pattern, replacement)
	end
	return word
end

-- split components function: marks prefixes with ">" and suffixes with "<" in
-- every word, then returns all words joined by single spaces
local function components(words, etyl, pos)
	-- match any pos if no pos provided
	if not pos then
		pos = ".*"
	end

	-- run every prefix rule and then every suffix rule over one piece of text;
	-- the four strings are glued around the affix spelling to form the pattern
	local function mark(text, pre_l, pre_r, suf_l, suf_r)
		for i = 1, #affixes.pref do
			local entry = affixes.pref[i]
			text = apply_affix(text, entry, pos, pre_l .. entry[1] .. pre_r, entry[1] .. ">")
		end
		for i = 1, #affixes.suf do
			local entry = affixes.suf[i]
			text = apply_affix(text, entry, pos, suf_l .. entry[1] .. suf_r, "<" .. entry[1])
		end
		return text
	end

	local marked_words = {}
	for _, word in ipairs(words) do
		-- affixes written with an explicit hyphen: the hyphen itself is replaced by the marker
		word = mark(word, "", "%-", "%-", "")

		-- remaining hyphens separate compound parts; a prefix is only looked
		-- for at the start of a part and a suffix only at its end
		local pieces = split(word, "-")
		for i = 1, #pieces do
			pieces[i] = mark(pieces[i], "^", "", "", "$")
		end

		marked_words[#marked_words + 1] = table.concat(pieces, "-")
	end

	return table.concat(marked_words, " ")
end

-- syllabification function (FIXME: work on IPA and syllabification separately)
-- Inserts the hyphenation point "‧" between the syllables of `term`.
-- `term` is expected to have its word borders marked with "#" (see
-- export.hyphenation); `etyl` and `pos` are accepted but not used here.
-- The steps below depend on each other's output and must stay in this order.
local function syllabify(term, etyl, pos)
	-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenation form)
	-- only matches a combining diaeresis, i.e. text in decomposed form
	term = rsub(term, "([" .. vowels .. "])" .. DR, "‧%1")

	-- mark trigraphs and digraphs with curly braces
	-- (graphemes_sorted lists longer spellings first)
	for _, graph in ipairs(graphemes_sorted) do
		term = rsub(term, graph, "{" .. graph .. "}")
	end

	-- add dot before consonant + vowel
	-- (the consonant and the opening brace are both optional; a combining accent after the vowel stays attached)
	term = rsub(term, "([" .. cons .. "]?{?[" .. vowels .. "][" .. accents .. "]?)", "‧%1")

	-- remove any dots inside brackets
	-- (undoes the boundaries the previous step put inside a bracketed multigraph)
	term = rsub(term, "{[^}]*}", function(a) return rsub(a, "‧", "") end)

	-- shift dot before certain consonant clusters and digraphs
	term = rsub(term, "([bcfgkpvw])‧l", "‧%1l") -- clusters with l
	term = rsub(term, "([bcdfgkptwv])‧r", "‧%1r") -- clusters with r
	term = rsub(term, "([dst])‧j", "‧%1j") -- digraphs with j
	term = rsub(term, "([ckgt])‧h", "‧%1h") -- digraphs with h
	term = rsub(term, "n‧g", "ng‧") -- ng is syllable-final
	-- term = rsub(term, ">s‧", ">‧s") -- s can form a cluster after a prefix

	-- remove leading dots and brackets
	-- drop a boundary that follows "#" with no vowel in between
	term = rsub(term, "#([^" .. vowels .. "]*)‧", "#%1")
	-- a "." typed in the input is a manual syllable break
	term = rsub(term, "%.", "‧")
	-- strips the braces and also any hyphen
	term = rsub(term, "[{}-]", "") -- comment out to debug
	-- collapse runs of boundaries into one
	return rsub(term, "‧+", "‧")
end

-- hyphenation function
-- Returns `term` with "‧" between its syllables and all markup removed.
--   term : the word(s) to hyphenate, or a table with an `args` field (as
--          passed on template invocation), in which case args[1] is used
--   etyl, pos : accepted for symmetry with export.show; not used here
function export.hyphenation(term, etyl, pos)
	-- get user input as table
	if type(term) == "table" then
		term = term.args[1]
	end

	-- decompose accents: syllabify() only recognises a diaeresis, acute,
	-- grave or circumflex as a separate combining character, so precomposed
	-- input such as "ë" or "é" would otherwise be left unsplit
	term = decomp(term)

	-- mark all word borders
	term = rsub(term, "([^ ]+)", "#%1#")

	-- format hyphenation
	-- local data = { lang = lang, sc = sc, hyphs = {{hyph = rsub(syllabify(term), "[#%[%]<>]", ""), "%.")}} }

	-- return hyphen.format_hyphenations(data)

	-- recompose for display, then strip the markup characters: word borders,
	-- square brackets, affix markers and the "+" marker
	local hyphenated = recomp(syllabify(term))
	return rsub(hyphenated, "[#%[%]<>%+]", "")
end

-- generate substitutions function: picks the rules from `subs` that apply to
-- the given etymology and returns them, in table order, as {pattern, replacement}
-- pairs (`term` and `pos` are accepted but not consulted)
local function generate_subs(term, etyl, pos)
	local selected, taken = {}, {}

	for i = 1, #subs do
		local rule = subs[i]
		local patt, rule_etyl = rule[1], rule[3]

		-- a rule is wanted when it is a default rule or belongs to the requested etymology
		local wanted = rule_etyl == "-" or rule_etyl == etyl

		-- the first wanted rule for a pattern wins; later ones with the same pattern are dropped
		if wanted and not taken[patt] then
			taken[patt] = true
			selected[#selected + 1] = {patt, rule[2]}
		end
	end

	return selected
end

-- stress assignment function
-- Inserts the stress mark "ˈ" into `term`, whose word borders are marked with "#".
--   term : the marked-up term
--   etyl : etymology code, or nil when unspecified
--   pos  : accepted but not used here
-- Returns the term with "ˈ" directly before the first matching entry of
-- `stressed_endings`, or after the initial "#" when no ending qualifies.
local function stress(term, etyl, pos)
	-- a loanword is anything with an explicit etymology other than native
	-- ("af") or the default marker ("-"). The previous test,
	-- `not etyl and etyl ~= "af"`, was true only when no etymology was given,
	-- so it stressed "-el" in exactly the wrong case.
	local is_loanword = etyl ~= nil and etyl ~= "af" and etyl ~= "-"

	-- words with certain endings are syllable-final stressed
	for _, ending in ipairs(stressed_endings) do
		local tail = ending .. "#"
		if find(term, tail) then
			local qualifies = true
			if ending == "el" then -- "-el" is only stressed in loanwords
				qualifies = is_loanword
			elseif ending == "o" then -- "-o" is only stressed in french loanwords
				qualifies = etyl == "fr"
			end

			-- a matching but disqualified ending stops the search altogether
			if not qualifies then
				break
			end
			return rsub(term, tail, "ˈ" .. tail)
		end
	end

	-- add stress mark to first syllable if no ending was stressed
	return rsub(term, "^#", "#ˈ")
end

-- pronunciation function
-- work in progress: only canonicalisation, affix marking and the final clean-up
-- are active; every other stage is kept below in disabled form
local function toIPA(text, etyl, pos)
	-- canonicalise the input into an array of words and mark their affixes;
	-- the result is a single space-joined string
	local term = components(canonicalise(text), etyl, pos)

	-- disabled: add stress to term
	-- term = stress(term, etyl, pos)

	-- disabled: syllabify term
	-- term = syllabify(term, etyl, pos)

	-- disabled: shift stress rightwards to a syllable boundary
	-- term = rsub(term, "([^" .. syll_boundary .. "]*)ˈ", "ˈ%1")

	--[[
	-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
	local to_sub = generate_subs(term, etyl, pos)

	-- go over substitution table
	-- NOTE(review): the result of rsub() is discarded on the line marked below,
	-- so this loop would not change term once re-enabled
	for _, s in ipairs(to_sub) do
		local k, v = s[1], s[2]
		rsub(term, k, v) -- <- result discarded
	end

	-- make text lowercase again
	term = lower(term)

	-- substitute graphemes
	for graph, phoneme in pairs(graphemes) do
		term = rsub(term, graph, phoneme)
	end

	-- substitute single-letter vowels
	term = rsub(term, "([aeiou])([‧#ː" .. cons .. "])", function(a, b)
		if match("[‧#]", b) then
			return sets.vowel_length[a][2] .. b -- for open syllables
		else
			return sets.vowel_length[a][1] .. b -- for closed syllables
		end
	end)

	-- replace į, ů, ü with their actual phonetic values
	term = rsub(term, "[įůü]", {["į"] = "i", ["ů"] = "u", ["ü"] = "y"})

	-- remove double consonants
	term = rsub(term, "(.)(‧?)%1", "%2%1")
	]]--

	-- final adjustments: syllable boundaries become ASCII dots, then the word
	-- border markers and square brackets are removed
	local dotted = rsub(term, "‧", ".")
	return rsub(dotted, "[#%[%]]", "")
end

-- main export function: accepts either the term itself or a table with an
-- `args` field (whose first argument is then used) and passes it to toIPA()
function export.show(term, etyl, pos)
	local input = term
	if type(input) == "table" then
		-- get user input as table
		input = input.args[1]
	end

	return toIPA(input, etyl, pos)
end

return export