--[[
This module implements the template {{af-IPA}}.
Author: AmazingJus
Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]
local export = {}
local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str = require("Module:string")
local tbl = require("Module:table")
-- Tags `text` as Afrikaans in Latin script by delegating to
-- [[Module:script utilities]]; `face` is forwarded unchanged.
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end
-- Builds a full link to `term` as an Afrikaans, Latin-script entry by
-- delegating to [[Module:links]]; `face` is forwarded unchanged.
function export.link(term, face)
	local data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(data, face)
end
local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local gsplit = mw.text.gsplit
local sub = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch
-- Same as rsubn(), but returns only the substituted string and drops the
-- replacement count.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the result list to its first value
	return (rsubn(term, foo, bar))
end
-- Keeps applying rsub() with the same pattern and replacement until the
-- string stops changing, then returns that stable string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- list of constants
-- combining diacritics (the text is handled in decomposed form)
local GR = u(0x0300) -- grave
local AC = u(0x0301) -- acute
local CR = u(0x0302) -- circumflex
local DR = u(0x0308) -- diaeresis
local accents = GR .. AC .. CR .. DR
-- letter classes, spliced into pattern character sets elsewhere in the module
local vowels = "aeiouyAEIOUY"
-- the uppercase half mirrors the lowercase half (uppercase "R" used to be missing,
-- so a capital R was not recognised as a consonant)
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
-- characters that delimit a syllable: the hyphenation point and the word-border marker
local syll_boundary = "‧#"
-- multi-letter graphemes (diphthongs and long vowels included) mapped to the
-- phoneme string each one stands for
local graphemes = {
	-- trigraphs
	aai = "ɑːɪ̯",
	eeu = "iʊ̯",
	ieu = "iʊ̯",
	oei = "uɪ̯",
	ooi = "oːɪ̯",
	-- digraphs
	aa = "ɑː",
	ae = "ɑː",
	ai = "aɪ̯",
	au = "œʊ̯",
	ee = "ɪə̯",
	ei = "əɪ̯",
	eu = "iʊ̯",
	ie = "į", -- temporary value
	oe = "ů", -- temporary value
	oi = "ɔɪ̯",
	oo = "ʊə̯",
	ou = "œʊ̯",
	ui = "uɪ̯",
	uu = "ü" -- temporary value
}
-- grapheme keys ordered longest-first, so trigraphs come before digraphs
-- (the relative order of keys of equal length is not specified: it follows
-- pairs() and an unstable sort)
local graphemes_sorted = {}
for grapheme in pairs(graphemes) do
	graphemes_sorted[#graphemes_sorted + 1] = grapheme
end
table.sort(graphemes_sorted, function(first, second)
	return len(second) < len(first)
end)
-- grouped grapheme/phoneme sets
local sets = {
	-- per vowel letter: { value used in closed syllables, value used in open syllables }
	vowel_length = {
		a = {"a", "ɑː"},
		e = {"ɛ", "ɪə̯"},
		i = {"ə", "i"},
		o = {"ɔ", "ʊə̯"},
		u = {"œ", "y"}
	},
	-- consonant pairs: { voiced, voiceless }
	cons_voice = {
		{"b", "p"},
		{"d", "t"},
		{"ʤ", "ʧ"},
		{"ɡ", "k"},
		{"v", "f"},
		{"z", "s"},
		{"ʒ", "ʃ"},
	}
}
-- recognised affixes; each entry is { spelling } with an optional `pos` field
-- restricting the part of speech the affix applies to
local affixes = {
	-- prefixes
	pref = {
		{"aan"}, {"agter"}, {"be"}, {"deur"}, {"er"},
		{"ge"}, {"her"}, {"om"}, {"ont"}, {"onder"},
		{"van", pos = "d"},
		{"ver"}, {"voor"}
	},
	-- suffixes
	suf = {
		{"agtig"}, {"baar"}, {"dom"}, {"end"}, {"heid"},
		{"lik"}, {"loos"}, {"nis"}, {"sel"}, {"skap"},
	}
}
-- words that carry no stress
local unstressed = { "die", "dit", "is", "nie", "'n" }
-- word endings that attract stress in loanwords; the list is scanned in this
-- order, so the order of the entries is significant
local stressed_endings = {
	-- a-
	"aal", "aan", "aans", "aar", "aard", "aat", "am", "ant", "at",
	-- e-
	"ee", "eel", "eem", "een", "eer", "ees", "eet", "ein", "ek",
	"el", -- "-el" only in loanwords
	"ent", "es", "et", "eur", "eus", "eut",
	-- i-
	"ieel", "ief", "iek", "iel", "iem", "ien", "ine", "ier", "iet",
	-- o-
	"o", -- "-o" only in french loanwords
	"oen", "on", "oof", "oog", "ooi", "ool", "oom", "oon", "oor",
	-- t-
	"teek", "teit",
	-- u- and y-
	"u", "uum", "uur", "uus", "uut", "y", "yn", "ys"
}
-- list of respelling substitutions
-- Each entry is {pattern, replacement, etymology}:
--   * pattern and replacement are in the form taken by rsub();
--   * etymology is a language code ("fr", "en", "af") for a rule that only applies to
--     terms of that etymology, or "-" for the default rule.
-- generate_subs() walks this list in order and keeps only the first applicable rule for
-- a given pattern string, so an etymology-specific rule must come before the default
-- rule that shares its pattern.
-- NOTE(review): several replacements are written in uppercase ("OU", "EI", "A", "wA");
-- presumably this keeps later rules from re-matching them until the text is lowercased
-- again -- confirm against the code that consumes this table.
local subs = {
-- 'N
{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise
-- CH
{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
{"ch([" .. cons .. "]?[ei])", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
{"ch", "k", "-"}, -- otherwise /k/
-- NG
{"ng", "ŋ", "-"}, -- pronounced /ŋ/
-- SH/SJ
{"s[hj]", "ʃ", "-"}, -- pronounced /ʃ/
-- DJ/TJ
{"[dt]jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/
-- C
{"c([ei])", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
{"c", "k", "-"}, -- otherwise /k/
-- GH
{"gh", "ɡ", "-"}, -- pronounced /ɡ/
-- G
{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
{"g", "χ", "-"}, -- otherwise /χ/
{"n(‧?[kɡ])", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/
-- V
{"v", "f", "af"}, -- pronounced /f/ in native words
-- W
{"w", "w", "en"}, -- pronounced /w/ in english loans
{"w", "v", "-"}, -- otherwise /v/
-- EAU
{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans
-- OI
{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans
-- IJ
{"ij([^" .. vowels .. "])", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names
-- X
{"#x", "#s", "-"}, -- pronounced /s/ word-initially
{"x", "ks", "-"}, -- otherwise /ks/
-- H
{"([" .. cons .. vowels .. "])h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
{"h", "ɦ", "-"}, -- otherwise /ɦ/
-- O
{"o([" .. syll_boundary .. "])", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position
-- U
{"u([" .. cons .. "])", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
{"u", "jů", "en"}, -- otherwise /ju/ in english loans
-- Y
{"y", "j", "en"}, -- pronounced /j/ in english loans
{"y", "EI", "-"}, -- otherwise /əɪ̯/
-- circumflex accent
{CR, "ː", "-"} -- lengthens a vowel with its short quality
}
-- canonicalisation function
-- Normalises raw input text and splits it into words.
--   text: the raw term or phrase
--   returns: an array of lowercase, accent-decomposed words; every comma in the
--            input becomes a separate "|" (pause) element
local function canonicalise(text)
	-- decompose accents
	text = decomp(text)
	-- make text lowercase
	text = lower(text)
	-- treat commas as a pause; this runs before the whitespace clean-up so that a
	-- leading or trailing comma cannot leave a stray space (and hence an empty word)
	-- behind. One pass is enough because the replacement contains no comma.
	text = rsub(text, "%s*,%s*", " | ")
	-- remove extraneous spaces: collapse runs, then trim both ends
	text = rsub(text, "%s+", " ")
	text = rsub(text, "^ ", "")
	text = rsub(text, " $", "")
	-- return as array of words
	return split(text, " ")
end
-- only apply relevant affixes
-- Marks one affix in `word` if the affix is allowed for the given part of speech.
--   word:        the word or compound part to process (formerly named `string`, which
--                shadowed the standard string library)
--   affix:       an entry of the affix list; its optional `pos` field restricts the
--                parts of speech it applies to
--   pos:         the part of speech of the term; nil or the ".*" wildcard that
--                components() passes when no part of speech was given means "any"
--   pattern:     pattern locating the affix inside `word`
--   replacement: replacement that inserts the affix marker
--   returns: `word` with the affix marked, or `word` unchanged
local function apply_affix(word, affix, pos, pattern, replacement)
	-- An unrestricted affix always qualifies. A restricted one qualifies when no part
	-- of speech was supplied or when the supplied one matches the restriction.
	-- Previously the ".*" wildcard was searched as a literal subject string, so
	-- restricted affixes were never applied when no part of speech was supplied.
	local pos_allowed = not affix.pos
		or pos == nil
		or pos == ".*"
		or find(pos, affix.pos)
	if pos_allowed and find(word, pattern) then
		return rsub(word, pattern, replacement)
	end
	return word
end
-- split components function
-- Marks affix boundaries in every word: ">" is placed after a prefix and "<" before
-- a suffix.
--   words: array of words
--   etyl:  accepted but not used here
--   pos:   part of speech, forwarded to apply_affix(); defaults to ".*"
--   returns: the processed words joined with single spaces
local function components(words, etyl, pos)
	-- match any pos if no pos provided
	pos = pos or ".*"
	local marked_words = {}
	for word_index, word in ipairs(words) do
		-- affixes written with an explicit hyphen: the hyphen itself becomes the marker
		for _, prefix in ipairs(affixes.pref) do
			local spelling = prefix[1]
			word = apply_affix(word, prefix, pos, spelling .. "%-", spelling .. ">")
		end
		for _, suffix in ipairs(affixes.suf) do
			local spelling = suffix[1]
			word = apply_affix(word, suffix, pos, "%-" .. spelling, "<" .. spelling)
		end
		-- every remaining hyphen separates compound parts; mark affixes found at the
		-- very start or very end of each part
		local parts = split(word, "-")
		for part_index, part in ipairs(parts) do
			for _, prefix in ipairs(affixes.pref) do
				local spelling = prefix[1]
				part = apply_affix(part, prefix, pos, "^" .. spelling, spelling .. ">")
			end
			for _, suffix in ipairs(affixes.suf) do
				local spelling = suffix[1]
				part = apply_affix(part, suffix, pos, spelling .. "$", "<" .. spelling)
			end
			parts[part_index] = part
		end
		marked_words[word_index] = table.concat(parts, "-")
	end
	return table.concat(marked_words, " ")
end
-- syllabification function (FIXME: work on IPA and syllabification separately)
-- Inserts the hyphenation point "‧" at syllable boundaries.
--   term: text in decomposed form with every word wrapped in "#" markers
--   etyl, pos: accepted but not used here
--   returns: the term with "‧" between syllables; braces and hyphens are removed
local function syllabify(term, etyl, pos)
	-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenation form)
	term = rsub(term, "([" .. vowels .. "])" .. DR, "‧%1")
	-- mark trigraphs and digraphs with curly braces, longest first. Spans that are
	-- already braced are skipped: wrapping a digraph inside a marked trigraph would
	-- nest the braces, and the dot removal below would then leave a break inside
	-- the trigraph.
	for _, graph in ipairs(graphemes_sorted) do
		term = rsub(term, "({?)([^{}]+)(}?)", function(open, body, close)
			if open == "{" then
				return open .. body .. close
			end
			return rsub(body, graph, "{" .. graph .. "}") .. close
		end)
	end
	-- add dot before consonant + vowel
	term = rsub(term, "([" .. cons .. "]?{?[" .. vowels .. "][" .. accents .. "]?)", "‧%1")
	-- remove any dots inside brackets
	term = rsub(term, "{[^}]*}", function(a) return rsub(a, "‧", "") end)
	-- shift dot before certain consonant clusters and digraphs
	term = rsub(term, "([bcfgkpvw])‧l", "‧%1l") -- clusters with l
	term = rsub(term, "([bcdfgkptwv])‧r", "‧%1r") -- clusters with r
	term = rsub(term, "([dst])‧j", "‧%1j") -- digraphs with j
	term = rsub(term, "([ckgt])‧h", "‧%1h") -- digraphs with h
	term = rsub(term, "n‧g", "ng‧") -- ng is syllable-final
	-- term = rsub(term, ">s‧", ">‧s") -- s can form a cluster after a prefix
	-- remove leading dots and brackets
	term = rsub(term, "#([^" .. vowels .. "]*)‧", "#%1")
	-- a full stop in the input is treated as a syllable break
	term = rsub(term, "%.", "‧")
	term = rsub(term, "[{}-]", "") -- comment out to debug
	-- collapse runs of dots into one
	return rsub(term, "‧+", "‧")
end
-- hyphenation function
-- Returns `term` split into syllables with "‧".
--   term: the text to hyphenate, or a table whose args[1] holds it
--   etyl, pos: forwarded to syllabify()
--   returns: the hyphenated text in composed form, with the internal
--            markers "#", "[", "]", "<" and ">" removed
function export.hyphenation(term, etyl, pos)
	-- get user input as table
	if type(term) == "table" then
		term = term.args[1]
	end
	-- decompose accents: syllabify() looks for a vowel followed by a combining mark,
	-- so a precomposed letter such as "ë" would otherwise go unnoticed. The result
	-- is recomposed below.
	term = decomp(term)
	-- mark all word borders
	term = rsub(term, "([^ ]+)", "#%1#")
	-- format hyphenation
	-- NOTE(review): the dormant lines below refer to `hyphen`, but this file loads
	-- [[Module:hyphenation]] as `hyph`; the first line is also unbalanced as written.
	-- local data = { lang = lang, sc = sc, hyphs = {{hyph = rsub(syllabify(term), "[#%[%]<>]", ""), "%.")}} }
	-- return hyphen.format_hyphenations(data)
	return rsub(recomp(syllabify(term, etyl, pos)), "[#%[%]<>]", "")
end
-- generate substitutions function
-- Picks the respelling rules that apply to a term of the given etymology.
--   term, pos: accepted but not used here
--   etyl: etymology code; rules tagged with this code or with "-" (default) qualify
--   returns: array of {pattern, replacement} pairs in list order, keeping only the
--            first qualifying rule for each pattern
local function generate_subs(term, etyl, pos)
	local selected, taken = {}, {}
	for _, rule in ipairs(subs) do
		local pattern, replacement, rule_etyl = rule[1], rule[2], rule[3]
		-- a rule qualifies when it is a default rule or is tagged with this etymology
		local qualifies = rule_etyl == "-" or rule_etyl == etyl
		if qualifies and not taken[pattern] then
			taken[pattern] = true
			selected[#selected + 1] = {pattern, replacement}
		end
	end
	return selected
end
-- stress assignment function
-- Places the stress mark "ˈ" in front of a stressed ending, or at the start of the
-- term when no stressed ending applies.
--   term: text with words wrapped in "#" markers
--   etyl: etymology code; nil, "af" and "-" are treated as "not a loanword"
--   pos:  accepted but not used here
--   returns: the term with a stress mark inserted
local function stress(term, etyl, pos)
	-- "-" is the default-rule marker used by the substitution list, so it is not
	-- counted as a loan etymology here
	local is_loan = etyl ~= nil and etyl ~= "af" and etyl ~= "-"
	-- words with certain endings are syllable-final stressed
	for _, ending in ipairs(stressed_endings) do
		local final = ending .. "#"
		if find(term, final) then
			local applies
			if ending == "el" then
				-- "-el" is only stressed in loanwords (the old test was inverted:
				-- it was true only when no etymology was given)
				applies = is_loan
			elseif ending == "o" then
				-- "-o" is only stressed in french loanwords
				applies = etyl == "fr"
			else
				applies = true
			end
			if applies then
				return rsub(term, final, "ˈ" .. ending .. "#")
			end
			-- otherwise keep scanning: a later, longer ending such as "-iel" may
			-- still apply (the old code stopped searching at this point)
		end
	end
	-- add stress mark to first syllable if no ending was stressed
	return rsub(term, "^#", "#ˈ")
end
-- pronunciation function
-- Runs the conversion pipeline on `text`. Only canonicalisation, affix marking and
-- the final clean-up are active; the stress, syllabification and phoneme
-- substitution stages are present as comments only.
--   text: raw term or phrase
--   etyl, pos: forwarded to components()
--   returns: the processed term with "‧" turned into "." and "#", "[", "]" removed
local function toIPA(text, etyl, pos)
	-- canonicalise the text into words, then mark affixes and compound parts
	local term = components(canonicalise(text), etyl, pos)
	-- add stress to term
	-- term = stress(term, etyl, pos)
	-- syllabify term
	-- term = syllabify(term, etyl, pos)
	-- shift stress rightwards to a syllable boundary
	-- term = rsub(term, "([^" .. syll_boundary .. "]*)ˈ", "ˈ%1")
	--[[
	-- NOTE(review) before re-enabling this section:
	--   * the substitution loop calls rsub() without assigning the result to term;
	--   * match() is called with the pattern first and the subject second.
	-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
	local to_sub = generate_subs(term, etyl, pos)
	-- go over substitution table
	for _, s in ipairs(to_sub) do
	local k, v = s[1], s[2]
	rsub(term, k, v)
	end
	-- make text lowercase again
	term = lower(term)
	-- substitute graphemes
	for graph, phoneme in pairs(graphemes) do
	term = rsub(term, graph, phoneme)
	end
	-- substitute single-letter vowels
	term = rsub(term, "([aeiou])([‧#ː" .. cons .. "])", function(a, b)
	if match("[‧#]", b) then
	return sets.vowel_length[a][2] .. b -- for open syllables
	else
	return sets.vowel_length[a][1] .. b -- for closed syllables
	end
	end)
	-- replace į, ů, ü with their actual phonetic values
	term = rsub(term, "[įůü]", {["į"] = "i", ["ů"] = "u", ["ü"] = "y"})
	-- remove double consonants
	term = rsub(term, "(.)(‧?)%1", "%2%1")
	]]--
	-- final adjustments: show syllable breaks as full stops, drop internal markers
	term = rsub(term, "‧", ".")
	term = rsub(term, "[#%[%]]", "")
	return term
end
-- main export function
-- Entry point: accepts either the text itself or a table whose args[1] holds the
-- text, and returns the result of toIPA().
function export.show(term, etyl, pos)
	local text = term
	if type(text) == "table" then
		-- get user input as table
		text = text.args[1]
	end
	return toIPA(text, etyl, pos)
end
return export