Module:User:AmazingJus/af

The following documentation is located at Module:User:AmazingJus/af/documentation. [edit] Categories were auto-generated by Module:documentation. [edit]

Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace

92 of 148 tests failed. (refresh)

test_hyphen:
Text	Expected	Actual
Afrika	A‧fri‧ka	A‧fri‧ka
Afrikaans	A‧fri‧kaans	A‧fri‧kaans
Afrikaner	A‧fri‧ka‧ner	A‧fri‧ka‧ner
Amerikaner	A‧me‧ri‧ka‧ner	A‧me‧ri‧ka‧ner
asyn	a‧syn	a‧syn
belangrik	be‧lang‧rik	be‧lang‧rik
berg	berg	berg
berge	ber‧ge	ber‧ge
berg-reeks	berg‧reeks	berg‧reeks
bos-bedryf	bos‧be‧dryf	bos‧be‧dryf
beskou	be‧skou	bes‧kou
be+ter	be‧ter	be+‧ter
beton	be‧ton	be‧ton
betoon	be‧toon	be‧toon
Botha	Bo‧tha	Bo‧tha
braai	braai	braai
dokumentasie	do‧ku‧men‧ta‧sie	do‧ku‧men‧ta‧sie
eggo	eg‧go	eg‧go
feste	fes‧te	fes‧te
geëet	ge‧eet	geë‧et
gegee	ge‧gee	ge‧gee
ghitaar	ghi‧taar	ghi‧taar
hondjie	hon‧djie	hon‧djie
Johannesburg	Jo‧han‧nes‧burg	Jo‧han‧nes‧burg
karretjie	kar‧re‧tjie	kar‧re‧tjie
klu[b]	klub	klub
Macedonië	Ma‧ce‧do‧ni‧e	Ma‧ce‧do‧nië
'n	'n	'n
onweer	on‧weer	on‧weer
omstandigheid	om‧stan‧dig‧heid	oms‧tan‧di‧gheid
Paraguay	Pa‧ra‧guay	Pa‧ra‧gu‧a‧y
Pretoria	Pre‧to‧ri‧a	Pre‧to‧ri‧a
sjokolade	sjo‧ko‧la‧de	sjo‧ko‧la‧de
s'n	s'n	s'n
spieël	spie‧el	spieël
Suid-Afrika	Suid-‧A‧fri‧ka	Suid‧A‧fri‧ka
vanaand	va‧naand	va‧naand
Venesië	Ve‧ne‧si‧e	Ve‧ne‧sië
vinger	ving‧er	ving‧er
wîe	wî‧e	wîe
zero	ze‧ro	ze‧ro
André	An‧dré	André
Barnard	Bar‧nard	Bar‧nard
Blignaut	Blig‧naut	Blig‧naut
Blignault	Blig‧nault	Blig‧nault
Cilliers	Cil‧liers	Cil‧liers
Coetzee	Coet‧zee	Coet‧zee
Coetzer	Coet‧zer	Coet‧zer
de Villiers	de Vil‧liers	de Vil‧liers
du Plessis	du Ples‧sis	du Ples‧sis
du Preez	du Preez	du Preez
du Toit	du Toit	du Toit
Fouché	Fou‧ché	Fouché
Fourie	Fou‧rie	Fou‧rie
Grové	Gro‧vé	Grové
Jean Pierre	Jean Pierre	Je‧an Pier‧re
Joubert	Jou‧bert	Jou‧bert
La.bus.chag.ne	La‧bus‧chag‧ne	La‧bus‧chag‧ne
La.bu.schagne	La‧bu‧schagne	La‧bu‧s‧chag‧ne
le Gran.ge	le Gran‧ge	le Gran‧ge
le Roux	le Roux	le Roux
Malan	Ma‧lan	Ma‧lan
Malherbe	Mal‧her‧be	Mal‧her‧be
Marais	Ma‧rais	Ma‧rais
Meintjes	Mein‧tjes	Mein‧tjes
Naudé	Nau‧dé	Naudé
Nortje	Nor‧tje	Nor‧tje
Pienaar	Pie‧naar	Pie‧naar
Schalk	Schalk	Schalk
Terblanche	Ter‧blanche	Ter‧blan‧che
Theron	The‧ron	The‧ron
Viljoen	Vil‧joen	Vil‧joen
Visagie	Vi‧sa‧gie	Vi‧sa‧gie
Viviers	Vi‧vi‧ers	Vi‧viers

test_pron:
Text	Expected	Actual
Afrika	ˈɑː.fri.ka	afrika
Afrikaans	ˌa.friˈkɑ̃ːs, ˌa.friˈkɑːns	afrikaans
Afrikaner	ˌa.friˈkɑː.nər	afrikaner
Amerikaner	aˌmɪə̯.riˈkɑː.nər	amerikaner
asyn	aˈsəɪ̯n	asyn
belangrik	bəˈlaŋ.rək	be>langrik
berg	ˈbɛrχ	be>rg
berge	ˈbɛr.ɡə	be>rge
berg-reeks	ˈbɛrχ.rɪə̯ks	be>rg-reeks
bos-bedryf	ˈbɔs.bəˌdrəɪ̯f	bos-be>dryf
beskou	bəˈskœʊ̯	be>skou
be+ter	ˈbɪə̯.tər	be>+ter
beton	bəˈtɔn	be>ton
betoon	bəˈtʊə̯n	be>toon
Botha	ˈbʊə̯.ta	botha
braai	brɑːɪ̯	braai
dokumentasie	ˌdɔ.kju.mɛnˈtɑː.si, ˌdɔ.ky.mɛnˈtɑː.si	dokumentasie
eggo	ˈɛ.χu	eggo
feste	ˈfɛs.tə	feste
geëet	χəˈɪə̯t	ge>ëet
gegee	χəˈχɪə̯	ge>gee
ghitaar	ɡiˈtɑːr	ghitaar
hondjie	ˈɦœi̯ɲ.ci	hondjie
Johannesburg	jʊə̯ˈɦa.nəsˌbœrχ	johannesburg
karretjie	ˈka.rəi̯.ci	karretjie
klu[b]	klab, klœb	klub
Macedonië	ˌma.səˈdʊə̯.ni.ə	macedonië
'n	ə(n)	'n
onweer	ˈɔn.vɪə̯r	onweer
omstandigheid	ɔmˈstan.dəχˌɦəɪ̯t	om>standig<heid
Paraguay	ˈpa.ra.ɡwaɪ̯	paraguay
Pretoria	prəˈtʊə̯.ri.a	pretoria
sjokolade	ˌʃɔ.kɔˈlɑː.də	sjokolade
s'n	sən	s'n
spieël	spiːl	spieël
Suid-Afrika	səɪ̯tˈɑː.fri.ka	suid-afrika
vanaand	fəˈnɑːnt	vanaand
Venesië	vəˈniː.si.ə	venesië
vinger	ˈfəŋ.ər	vinger
wîe	ˈvəː.(ɦ)ə	wîe
zero	ˈzɪə̯.ru	zero
André	ˈan.drəɪ̯	andré
Barnard	ˈbar.nart	barnard
Blignaut	ˈbləχ.nœʊ̯t, ˈbli.nœʊ̯	blignaut
Blignault	ˈbləχ.nœʊ̯t, ˈbli.nœʊ̯	blignault
Cilliers	səlˈjeə̯	cilliers
Coetzee	kutˈseə̯	coetzee
Coetzer	ˈkut.sər	coetzer
de Villiers	də.fəlˈjeə̯	de villiers
du Plessis	dy.pləˈsi	du plessis
du Preez	dəˈpreə̯	du preez
du Toit	dəˈtoːɪ̯	du toit
Fouché	fuˈʃeə̯	fouché
Fourie	fuˈri	fourie
Grové	χruˈveə̯	grové
Jean Pierre	anˈpiːr	jean pierre
Joubert	juˈbæːr	joubert
La.bus.chag.ne	la.busˈkaχ.nə	la.bus.chag.ne
La.bu.schagne	ˈla.bu.ʃəɪ̯n	la.bu.schagne
le Gran.ge	ləˈχran.si	le gran.ge
le Roux	ləˈruː	le roux
Malan	maˈlan, maˈlaŋ	malan
Malherbe	malˈɦɛr.bə	malherbe
Marais	maˈrɛː	marais
Meintjes	məɪ̯ɲˈcis	meintjes
Naudé	nœʊ̯ˈdeə̯	naudé
Nortje	nɔrˈkɪə̯	nortje
Pienaar	ˈpi.nɑːr	pienaar
Schalk	skalk	schalk
Terblanche	tərˈblɑːnʃ	terblanche
Theron	t(ə)ˈron	theron
Viljoen	fəlˈjun	viljoen
Visagie	fəˈsɑː.χi, fəˈsɑː.si	visagie
Viviers	fə.fəˈjeə̯	viviers

--[[
This module implements the template {{af-IPA}}.

Author: AmazingJus

Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]

local export = {}

local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str = require("Module:string")
local tbl = require("Module:table")

-- Tag `text` as Afrikaans in Latin script by delegating to
-- Module:script utilities; `face` is forwarded unchanged.
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end

-- Build a full link to `term` as an Afrikaans, Latin-script entry by
-- delegating to Module:links; `face` is forwarded unchanged.
function export.link(term, face)
	local link_data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(link_data, face)
end

local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower

local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local gsplit = mw.text.gsplit
local sub = mw.ustring.sub

local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch

-- Like rsubn(), but hands back only the substituted string; the extra
-- parentheses truncate the call to its first return value and so drop
-- the replacement count.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub() with the same pattern and replacement until a
-- pass leaves the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- list of constants
local GR = u(0x0300) -- combining grave accent
local AC = u(0x0301) -- combining acute accent
local CR = u(0x0302) -- combining circumflex accent
local DR = u(0x0308) -- combining diaeresis
-- all four combining marks, for use inside a pattern character class
local accents = GR .. AC .. CR .. DR
-- vowel letters in both cases ("y" is treated as a vowel)
local vowels = "aeiouyAEIOUY"
-- consonant letters in both cases; the upper-case half previously lacked "R",
-- so a capital R was not recognised as a consonant by the patterns built from this
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
-- characters that end a syllable: the hyphenation point and the word border
local syll_boundary = "‧#"

-- list of valid trigraphs and digraphs, including diphthongs and long vowels
-- (key: spelling, value: phonetic value or a temporary placeholder)
local graphemes = {
	["aai"] = "ɑːɪ̯",
	["eeu"] = "iʊ̯",
	["ieu"] = "iʊ̯",
	["oei"] = "uɪ̯",
	["ooi"] = "oːɪ̯",
	["aa"] = "ɑː",
	["ae"] = "ɑː",
	["ai"] = "aɪ̯",
	["au"] = "œʊ̯",
	["ee"] = "ɪə̯",
	["ei"] = "əɪ̯",
	["eu"] = "iʊ̯",
	["ie"] = "į", -- temporary value
	["oe"] = "ů", -- temporary value
	["oi"] = "ɔɪ̯",
	["oo"] = "ʊə̯",
	["ou"] = "œʊ̯",
	["ui"] = "uɪ̯",
	["uu"] = "ü" -- temporary value
}
-- list the spellings longest first, so trigraphs are handled before the
-- digraphs they contain
local graphemes_sorted = {}
for k, _ in pairs(graphemes) do
	table.insert(graphemes_sorted, k)
end
-- pairs() yields keys in an unspecified order and table.sort is not stable,
-- so comparing on length alone left the relative order of equally long
-- spellings undefined; overlapping digraphs (e.g. "ei"/"ie" in "eie") could
-- then be bracketed differently from one run to the next. Break ties
-- alphabetically to make the order, and thus the output, deterministic.
table.sort(graphemes_sorted, function(a, b)
	local len_a, len_b = len(a), len(b)
	if len_a ~= len_b then
		return len_a > len_b
	end
	return a < b
end)

-- grapheme sets used by the phoneme substitution stage
local sets = {
	-- per vowel letter: { value in closed syllables, value in open syllables }
	vowel_length = {
		a = { "a", "ɑː" },
		e = { "ɛ", "ɪə̯" },
		i = { "ə", "i" },
		o = { "ɔ", "ʊə̯" },
		u = { "œ", "y" },
	},
	-- consonant pairs: { voiced, voiceless }
	cons_voice = {
		{ "b", "p" },
		{ "d", "t" },
		{ "ʤ", "ʧ" },
		{ "ɡ", "k" },
		{ "v", "f" },
		{ "z", "s" },
		{ "ʒ", "ʃ" },
	},
}

-- affixes recognised when splitting a word into components; each entry is
-- { spelling } with an optional `pos` field restricting it to a part of speech
local affixes = {
	-- prefixes
	pref = {
		{ "aan" },
		{ "agter" },
		{ "be" },
		{ "deur" },
		{ "er" },
		{ "ge" },
		{ "her" },
		{ "om" },
		{ "ont" },
		{ "onder" },
		{ "van", pos = "d" },
		{ "ver" },
		{ "voor" },
	},
	-- suffixes
	suf = {
		{ "agtig" },
		{ "baar" },
		{ "dom" },
		{ "end" },
		{ "heid" },
		{ "lik" },
		{ "loos" },
		{ "nis" },
		{ "sel" },
		{ "skap" },
	},
}

-- words that carry no stress
local unstressed = { "die", "dit", "is", "nie", "'n" }

-- word endings that attract stress in loanwords; the list is scanned in
-- this order, so the sequence of entries must be kept
local stressed_endings = {
	-- a…
	"aal", "aan", "aans", "aar", "aard", "aat", "am", "ant", "at",
	-- e…
	"ee", "eel", "eem", "een", "eer", "ees", "eet", "ein", "ek",
	"el", -- "-el" only in loanwords
	"ent", "es", "et", "eur", "eus", "eut",
	-- i…
	"ieel", "ief", "iek", "iel", "iem", "ien", "ine", "ier", "iet",
	-- o…
	"o", -- "-o" only in french loanwords
	"oen", "on", "oof", "oog", "ooi", "ool", "oom", "oon", "oor",
	-- t…
	"teek", "teit",
	-- u…
	"u", "uum", "uur", "uus", "uut",
	-- y…
	"y", "yn", "ys",
}

-- list of respelling substitutions
--
-- Each entry is { pattern, replacement, etymology }:
--   * pattern     - a pattern matched against the term ("#" marks a word
--                   border, "‧" a syllable boundary)
--   * replacement - the text substituted for the match
--   * etymology   - the etymology code the rule belongs to ("fr", "en", "af"),
--                   or "-" for a default rule
-- generate_subs() walks this list in order and keeps only the first rule seen
-- for any given pattern, so an etymology-specific rule must come before the
-- default rule that shares its pattern.
-- NOTE(review): several replacements use upper-case placeholders ("OU", "EI",
-- "wA", "A"); presumably they are lowered again by the "make text lowercase
-- again" step of the currently disabled pipeline in toIPA() — confirm.
local subs = {
	-- 'N
	{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
	{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise

	-- CH
	{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
	{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
	{"ch([" .. cons .. "]?[ei])", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
	{"ch", "k", "-"}, -- otherwise /k/

	-- NG
	{"ng", "ŋ", "-"}, -- pronounced /ŋ/

	-- SH/SJ
	{"s[hj]", "ʃ", "-"}, -- pronounced /ʃ/

	-- DJ/TJ
	{"[dt]jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
	{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
	{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/

	-- C
	{"c([ei])", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
	{"c", "k", "-"}, -- otherwise /k/

	-- GH
	{"gh", "ɡ", "-"}, -- pronounced /ɡ/

	-- G
	{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
	{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
	{"g", "χ", "-"}, -- otherwise /χ/
	{"n(‧?[kɡ])", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/

	-- V
	{"v", "f", "af"}, -- pronounced /f/ in native words

	-- W
	{"w", "w", "en"}, -- pronounced /w/ in english loans
	{"w", "v", "-"}, -- otherwise /v/

	-- EAU
	{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans

	-- OI
	{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans

	-- IJ
	{"ij([^" .. vowels .. "])", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names

	-- X
	{"#x", "#s", "-"}, -- pronounced /s/ word-initially
	{"x", "ks", "-"}, -- otherwise /ks/

	-- H
	{"([" .. cons .. vowels .. "])h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
	{"h", "ɦ", "-"}, -- otherwise /ɦ/

	-- O
	{"o([" .. syll_boundary .. "])", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
	{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position

	-- U
	{"u([" .. cons .. "])", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
	{"u", "jů", "en"}, -- otherwise /ju/ in english loans

	-- Y
	{"y", "j", "en"}, -- pronounced /j/ in english loans
	{"y", "EI", "-"}, -- otherwise /əɪ̯/

	-- circumflex accent
	{CR, "ː", "-"} -- lengthens a vowel with its short quality
}

-- Canonicalise raw input text and return it as an array of words.
-- Accents are decomposed into combining characters, the text is lowercased,
-- whitespace is collapsed and trimmed, and each comma becomes a standalone
-- "|" word that represents a pause.
local function canonicalise(text)
	-- decompose accents first, then fold case
	local normalised = lower(decomp(text))

	-- collapse runs of whitespace and drop extraneous leading/trailing spaces
	normalised = rsub(normalised, "%s+", " ")
	normalised = rsub(rsub(normalised, "^ ", ""), " $", "")

	-- a comma (with any surrounding whitespace) is treated as a pause
	normalised = rsub_repeatedly(normalised, "%s*,%s*", " | ")

	return split(normalised, " ")
end

-- Mark one affix in a word, if the affix is relevant.
--   word        - the text to rewrite
--   affix       - an entry of the affixes table; its optional `pos` field
--                 restricts the affix to a part of speech
--   pos         - pattern the affix's `pos` must match; components() passes
--                 ".*" when the caller supplied none, a missing value is
--                 treated the same way here
--   pattern     - pattern locating the affix in the word
--   replacement - replacement text carrying the ">" or "<" marker
-- Returns the rewritten word, or the word unchanged when the affix is
-- restricted to another part of speech or does not occur.
--
-- The first parameter used to be called `string`, which shadowed the standard
-- string library inside this function.
local function apply_affix(word, affix, pos, pattern, replacement)
	-- `pos` is the pattern and the affix's own tag is the subject: the original
	-- call had the two swapped, so the ".*" default was searched for the tag as
	-- literal text and a restricted affix could never match without an
	-- explicit pos
	local pos_matches = not affix.pos or find(affix.pos, pos or ".*")
	if pos_matches and find(word, pattern) then
		return rsub(word, pattern, replacement)
	end
	return word
end

-- Mark the affixes of every word and return the words joined by spaces.
-- A prefix is followed by ">" and a suffix is preceded by "<". Affixes that
-- the input attaches with a hyphen are marked first (the hyphen is consumed),
-- then each remaining hyphen-separated compound part is checked for a prefix
-- at its start and a suffix at its end.
--   words - array of words (as produced by canonicalise())
--   etyl  - accepted but not used here
--   pos   - part-of-speech pattern; defaults to ".*"
local function components(words, etyl, pos)
	local wanted_pos = pos or ".*"

	-- run a whole affix list over `text`; `left` and `right` are glued around
	-- the affix spelling to build the search pattern
	local function sweep(text, list, left, right, is_suffix)
		for _, affix in ipairs(list) do
			local spelling = affix[1]
			local replacement = is_suffix and ("<" .. spelling) or (spelling .. ">")
			text = apply_affix(text, affix, wanted_pos, left .. spelling .. right, replacement)
		end
		return text
	end

	local marked_words = {}
	for _, word in ipairs(words) do
		-- explicitly hyphenated affixes: "be-" becomes "be>", "-heid" becomes "<heid"
		word = sweep(word, affixes.pref, "", "%-", false)
		word = sweep(word, affixes.suf, "%-", "", true)

		-- every part of a compound gets its own prefix/suffix check
		local parts = {}
		for _, part in ipairs(split(word, "-")) do
			part = sweep(part, affixes.pref, "^", "", false)
			part = sweep(part, affixes.suf, "", "$", true)
			parts[#parts + 1] = part
		end

		marked_words[#marked_words + 1] = table.concat(parts, "-")
	end

	return table.concat(marked_words, " ")
end

-- Wrap every occurrence of `graph` in curly braces, but only in text that is
-- not already inside a {…} group. Without this guard a digraph is matched
-- again inside a trigraph that was marked earlier ("{aai}" became "{{aa}i}"),
-- and the brace clean-up in syllabify() then leaves a syllable boundary in the
-- middle of the trigraph.
-- Assumes the term carries no curly braces of its own.
local function mark_grapheme(term, graph)
	local wrapped = "{" .. graph .. "}"
	-- the appended empty group lets one pattern also reach the plain text that
	-- follows the last real group
	local marked = rsub(term .. "{}", "([^{}]*)({[^{}]*})", function(plain, group)
		return rsub(plain, graph, wrapped) .. group
	end)
	-- drop the two-character sentinel group again
	return sub(marked, 1, -3)
end

-- syllabification function (FIXME: work on IPA and syllabification separately)
-- Inserts "‧" at syllable boundaries. The term is expected to have "#" at its
-- word borders and decomposed accents (the diaeresis and accent handling
-- match combining characters). `etyl` and `pos` are accepted but not used yet.
local function syllabify(term, etyl, pos)
	-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenation form)
	term = rsub(term, "([" .. vowels .. "])" .. DR, "‧%1")

	-- mark trigraphs and digraphs with curly braces, longest spellings first
	for _, graph in ipairs(graphemes_sorted) do
		term = mark_grapheme(term, graph)
	end

	-- add dot before (optional consonant +) vowel, keeping an opening brace
	-- and a following accent attached to the vowel
	term = rsub(term, "([" .. cons .. "]?{?[" .. vowels .. "][" .. accents .. "]?)", "‧%1")

	-- remove any dots inside brackets, so a marked grapheme is never split
	term = rsub(term, "{[^}]*}", function(a) return rsub(a, "‧", "") end)

	-- shift dot before certain consonant clusters and digraphs
	term = rsub(term, "([bcfgkpvw])‧l", "‧%1l") -- clusters with l
	term = rsub(term, "([bcdfgkptwv])‧r", "‧%1r") -- clusters with r
	term = rsub(term, "([dst])‧j", "‧%1j") -- digraphs with j
	term = rsub(term, "([ckgt])‧h", "‧%1h") -- digraphs with h
	term = rsub(term, "n‧g", "ng‧") -- ng is syllable-final
	-- term = rsub(term, ">s‧", ">‧s") -- s can form a cluster after a prefix

	-- remove the dot that precedes the first vowel of a word
	term = rsub(term, "#([^" .. vowels .. "]*)‧", "#%1")
	-- manually entered "." boundaries become syllable boundaries
	term = rsub(term, "%.", "‧")
	-- strip the grapheme braces and hyphens
	term = rsub(term, "[{}-]", "") -- comment out to debug
	-- collapse runs of boundaries into one
	return rsub(term, "‧+", "‧")
end

-- hyphenation function
-- Returns `term` with "‧" between its syllables.
--   term - the text to hyphenate, or a frame-like table whose args[1] holds it
--   etyl - accepted but not used yet
--   pos  - accepted but not used yet
function export.hyphenation(term, etyl, pos)
	-- get user input as table
	if type(term) == "table" then
		term = term.args[1]
	end

	-- decompose accents: syllabify() matches the diaeresis and the other
	-- accents as separate combining characters, so precomposed input
	-- ("geëet", "André", "wîe") slipped past those rules and was split in
	-- the wrong place; the result is recomposed again below
	term = decomp(term)

	-- mark all word borders
	term = rsub(term, "([^ ]+)", "#%1#")

	-- TODO: format the result through Module:hyphenation (format_hyphenations)
	-- instead of returning the bare string

	-- recompose accents and strip the internal markers
	return rsub(recomp(syllabify(term)), "[#%[%]<>]", "")
end

-- Select the substitution rules that apply to a term.
-- A rule from `subs` is kept when it is a default rule (etymology "-") or
-- when its etymology equals `etyl` (and `etyl` is not "-" itself). Only the
-- first kept rule for any pattern is used, so an etymology-specific rule
-- listed ahead of a default rule with the same pattern overrides it.
-- Returns an ordered array of { pattern, replacement } pairs.
-- `term` and `pos` are accepted but not used.
local function generate_subs(term, etyl, pos)
	local selected = {}
	local taken = {}

	for i = 1, #subs do
		local rule = subs[i]
		local pattern, rule_etyl = rule[1], rule[3]
		local applies = rule_etyl == "-" or (etyl ~= "-" and rule_etyl == etyl)

		if applies and not taken[pattern] then
			selected[#selected + 1] = { pattern, rule[2] }
			taken[pattern] = true
		end
	end

	return selected
end

-- stress assignment function
-- Puts a stress mark "ˈ" in front of the first listed ending found at a word
-- border ("#"); when no ending qualifies, the mark goes at the very start of
-- the term.
--   term - text with "#" at its word borders
--   etyl - etymology code; decides whether the restricted endings apply
--   pos  - accepted but not used
local function stress(term, etyl, pos)
	-- A loanword has an explicit etymology that is neither native ("af") nor
	-- the "-" marker used for default rules. The original test read
	-- `not etyl and etyl ~= "af"`, which is only true when no etymology is
	-- given at all, i.e. the opposite of "only in loanwords".
	local is_loanword = etyl and etyl ~= "af" and etyl ~= "-"

	-- words with certain endings are syllable-final stressed
	for _, ending in ipairs(stressed_endings) do
		if find(term, ending .. "#") then
			local allowed = true
			if ending == "el" then -- "-el" is only stressed in loanwords
				allowed = is_loanword
			elseif ending == "o" then -- "-o" is only stressed in french loanwords
				allowed = etyl == "fr"
			end

			if allowed then
				return rsub(term, ending .. "#", "ˈ" .. ending .. "#")
			end
			-- A rejected restricted ending no longer aborts the scan: endings
			-- later in the list (e.g. "iel", which also ends in "el") still
			-- get their turn.
		end
	end

	-- add stress mark to first syllable if no ending was stressed
	return rsub(term, "^#", "#ˈ")
end

-- pronunciation function
-- Currently only canonicalises the text, marks its affixes and tidies the
-- internal markers; the stress, syllabification and phoneme stages below are
-- still disabled, so the result is a respelling rather than IPA.
--   text - raw input text
--   etyl - etymology code, forwarded to components()
--   pos  - part of speech, forwarded to components()
local function toIPA(text, etyl, pos)
	-- canonicalise into words, then mark prefixes and suffixes in each of them
	local marked = components(canonicalise(text), etyl, pos)

	-- Disabled stages, kept for reference (they operate on a variable `term`):
	--
	-- add stress to term
	-- term = stress(term, etyl, pos)
	--
	-- syllabify term
	-- term = syllabify(term, etyl, pos)
	--
	-- shift stress rightwards to a syllable boundary
	-- term = rsub(term, "([^" .. syll_boundary .. "]*)ˈ", "ˈ%1")
	--
	-- NOTE(review) before re-enabling the block below:
	--   * `rsub(term, k, v)` discards its result, so no substitution is kept
	--   * `match("[‧#]", b)` passes the pattern as the subject string
	--[=[
	-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
	local to_sub = generate_subs(term, etyl, pos)

	-- go over substitution table
	for _, s in ipairs(to_sub) do
		local k, v = s[1], s[2]
		rsub(term, k, v)
	end

	-- make text lowercase again
	term = lower(term)

	-- substitute graphemes
	for graph, phoneme in pairs(graphemes) do
		term = rsub(term, graph, phoneme)
	end

	-- substitute single-letter vowels
	term = rsub(term, "([aeiou])([‧#ː" .. cons .. "])", function(a, b)
		if match("[‧#]", b) then
			return sets.vowel_length[a][2] .. b -- for open syllables
		else
			return sets.vowel_length[a][1] .. b -- for closed syllables
		end
	end)

	-- replace į, ů, ü with their actual phonetic values
	term = rsub(term, "[įůü]", {["į"] = "i", ["ů"] = "u", ["ü"] = "y"})

	-- remove double consonants
	term = rsub(term, "(.)(‧?)%1", "%2%1")
	]=]

	-- final adjustments: syllable boundaries are shown as full stops, and the
	-- word borders and square brackets are dropped
	local dotted = rsub(marked, "‧", ".")
	local cleaned = rsub(dotted, "[#%[%]]", "")
	return cleaned
end

-- main export function
-- Returns the pronunciation string produced by toIPA().
--   term - the text itself, or a frame-like table whose args[1] holds it
--   etyl - etymology code, forwarded to toIPA()
--   pos  - part of speech, forwarded to toIPA()
function export.show(term, etyl, pos)
	local input = term
	if type(input) == "table" then
		-- called with a table: the text is its first argument
		input = input.args[1]
	end

	return toIPA(input, etyl, pos)
end

return export