--[[
This module implements the template {{af-IPA}}.
Author: AmazingJus
Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]
local export = {}
local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str = require("Module:string")
local tbl = require("Module:table")
-- tag text by delegating to tag_text() of Module:script utilities, always passing
-- this module's language object (code "af") and script object (code "Latn")
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end
-- link a term by delegating to full_link() of Module:links, with the term bundled
-- together with this module's language and script objects
function export.link(term, face)
	local link_data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(link_data, face)
end
local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local gsplit = mw.text.gsplit
local sub = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch
-- rsubn() wrapper that keeps only the substituted string; the parentheses around
-- the call drop the second return value (the number of substitutions)
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
-- keep applying rsub() until a pass leaves the string unchanged, then return it
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- list of constants (combining marks, built from their code points)
local grave = u(0x0300) -- grave
local acute = u(0x0301) -- acute
local circ = u(0x0302) -- circumflex
local dia = u(0x0308) -- diaeresis
-- list of char classes
local accent = grave .. acute .. circ .. dia
local vowel = "aeiouyAEIOUY"
-- every non-vowel letter in both cases; the upper-case half used to omit "R",
-- so a capital R was not recognised as a consonant by C, CV and non_C
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
local syll = "‧#"
-- put them into classes
local A = "[" .. accent .. "]" -- all accents
local V = "[" .. vowel .. "]" -- all vowels
local non_V = "[^" .. vowel .. "]" -- all non-vowels
local C = "[" .. cons .. "]" -- all consonants
local non_C = "[^" .. cons .. "]" -- all non-consonants
local CV = "[" .. cons .. vowel .. "]" -- all consonants and vowels
local S = "[" .. syll .. "]" -- any syllable boundary
-- valid multi-letter graphemes (diphthongs and long vowels included), keyed by
-- spelling; the values are the strings substituted for them
local graphemes = {
	-- trigraphs
	aai = "ɑːɪ̯",
	eeu = "iʊ̯",
	ieu = "iʊ̯",
	oei = "uɪ̯",
	ooi = "oːɪ̯",
	-- digraphs
	aa = "ɑː",
	ae = "ɑː",
	ai = "aɪ̯",
	au = "œʊ̯",
	ee = "ɪə̯",
	ei = "əɪ̯",
	eu = "iʊ̯",
	ie = "į", -- temporary value
	oe = "ů", -- temporary value
	oi = "ɔɪ̯",
	oo = "ʊə̯",
	ou = "œʊ̯",
	ui = "uɪ̯",
	uu = "ü" -- temporary value
}
-- sort trigraphs and digraphs in descending order of length; pairs() visits keys
-- in an unspecified order, so graphemes of equal length are additionally ordered
-- alphabetically to make the resulting list (and everything that iterates over
-- it) deterministic
local graphemes_sorted = {}
for k, _ in pairs(graphemes) do
	table.insert(graphemes_sorted, k)
end
table.sort(graphemes_sorted, function(a, b)
	local len_a, len_b = len(a), len(b)
	if len_a ~= len_b then
		return len_a > len_b
	end
	return a < b
end)
-- assorted grapheme sets
local sets = {
	-- per vowel letter: { short value, long value }
	vowel_length = {
		a = {"a", "ɑː"},
		e = {"ɛ", "ɪə̯"},
		i = {"ə", "i"},
		o = {"ɔ", "ʊə̯"},
		u = {"œ", "y"}
	},
	-- consonant pairs: { voiced, voiceless }
	cons_voice = {
		{"b", "p"},
		{"d", "t"},
		{"ʤ", "ʧ"},
		{"ɡ", "k"},
		{"v", "f"},
		{"z", "s"},
		{"ʒ", "ʃ"},
	}
}
-- affixes known to the module; each entry holds the affix text in slot 1 and may
-- carry a "pos" field restricting where it applies
local affixes = {
	-- prefixes
	pre = {
		{"aan"}, {"agter"}, {"be"}, {"deur"}, {"er"}, {"ge"}, {"her"},
		{"om"}, {"ont"}, {"onder"},
		{"van", pos = "d"},
		{"ver"}, {"voor"}
	},
	-- suffixes
	suf = {
		{"agtig"}, {"baar"}, {"dom"}, {"end"}, {"heid"},
		{"lik"}, {"loos"}, {"nis"}, {"sel"}, {"skap"}
	}
}
-- words that carry no stress
local unstressed = { "die", "dit", "is", "nie", "'n" }
-- endings that attract stress (mostly in loanwords), listed in lookup order;
-- an entry is either a plain string or a table with the ending in slot 1 plus
-- extra fields ("orig", "stress") qualifying it
local stressed_endings = {
	"aal", "aan", "aans", "aar", "aard", "aat", "am", "ant", "asie", "at",
	"ee", "eel", "eem", "een", "eer", "ees", "eet", "ein", "ek",
	{"el", orig = "loan"}, -- only in loanwords
	"ent", "es", "et", "eur", "eus", "eut",
	"ieel", "ief", "iek", "iel", "iem", "ien", "ine", "ier", "iet",
	{"o", orig = "fr"}, -- only in french loanwords
	"oen", "on", "oof", "oog", "ooi", "ool", "oom", "oon", "oor",
	{"sie", stress = "pre"}, -- NOTE(review): presumably stress on the preceding syllable; confirm
	"teek", "teit",
	"u", "uum", "uur", "uus", "uut",
	"y", "yn", "ys"
}
-- list of respelling substitutions
-- Each rule is a triple { pattern, replacement, etymology }:
--   pattern     - matched against the "#"-delimited term; several patterns are built from the
--                 character classes defined earlier in this module (cons, non_V, CV, S, C)
--   replacement - replacement string, may use captures such as %1
--   etymology   - "-" marks a default rule; any other value is an etymology code that the rule
--                 is restricted to (codes used below: "fr", "en", "af")
-- The order matters: generate_subs() walks the list top to bottom and keeps only the first
-- applicable rule for each distinct pattern, so an etymology-specific rule has to precede the
-- default rule that shares its pattern.
local subs = {
-- 'N
{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise
-- CH
{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
{"ch([" .. cons .. "]?[ei])", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
{"ch", "k", "-"}, -- otherwise /k/
-- NG
{"ng", "ŋ", "-"}, -- pronounced /ŋ/
-- SH/SJ
{"s[hj]", "ʃ", "-"}, -- pronounced /ʃ/
-- DJ/TJ
{"[dt]jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/
-- C
{"c([ei])", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
{"c", "k", "-"}, -- otherwise /k/
-- GH
{"gh", "ɡ", "-"}, -- pronounced /ɡ/
-- G
{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
{"g", "χ", "-"}, -- otherwise /χ/
{"n(‧?[kɡ])", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/
-- V
{"v", "f", "af"}, -- pronounced /f/ in native words
-- W
{"w", "w", "en"}, -- pronounced /w/ in english loans
{"w", "v", "-"}, -- otherwise /v/
-- EAU
{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans
-- OI
{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans
-- IJ
{"ij(" .. non_V .. ")", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names
-- X
{"#x", "#s", "-"}, -- pronounced /s/ word-initially
{"x", "ks", "-"}, -- otherwise /ks/
-- H
{"(" .. CV .. ")h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
{"h", "ɦ", "-"}, -- otherwise /ɦ/
-- O
{"o(" .. S .. ")", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position
-- U
{"u(" .. C .. ")", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
{"u", "jů", "en"}, -- otherwise /ju/ in english loans
-- Y
{"y", "j", "en"}, -- pronounced /j/ in english loans
{"y", "EI", "-"}, -- otherwise /əɪ̯/
-- circumflex accent
{circ, "ː", "-"} -- lengthens a vowel with its short quality
}
-- canonicalisation function: normalises raw input and returns it as an array of
-- words, with every comma turned into a separate "|" (pause) word
local function canonicalise(text)
	-- decompose accents and fold to lowercase in one go
	local cleaned = lower(decomp(text))
	-- squeeze runs of whitespace to one space, then drop extraneous outer spaces
	cleaned = rsub(cleaned, "%s+", " ")
	cleaned = rsub(rsub(cleaned, "^ ", ""), " $", "")
	-- treat commas as a pause
	cleaned = rsub_repeatedly(cleaned, "%s*,%s*", " | ")
	-- return as array of words
	return split(cleaned, " ")
end
-- hyphen return function: the pattern fragment for a literal hyphen at depth 1,
-- an empty fragment at every other depth
local function is_hyphen_depth(depth)
	if depth == 1 then
		return "%-"
	end
	return ""
end
-- affix validation function
-- word:       the text to test (parameter renamed from "string", which shadowed the
--             standard string library inside this function)
-- affix:      affix entry; slot 1 is the affix text, the optional "pos" field is a
--             pattern that has to be found in `pos`
-- pos:        part-of-speech string of the word; may be nil
-- affix_type: "pre" anchors the affix at the start, anything else at the end
-- depth:      component depth; at depth 1 the affix must be set off by a hyphen
-- returns true or false (previously the match index leaked out instead of true)
local function is_valid_affix(word, affix, pos, affix_type, depth)
	-- not valid if pos restriction exists and no match; a missing pos cannot
	-- satisfy a restriction and must not crash find()
	if affix.pos and not (pos and find(pos, affix.pos)) then
		return false
	end
	-- match hyphen at appropriate depth
	local hyphen = is_hyphen_depth(depth)
	-- match appropriate pattern
	local pattern
	if affix_type == "pre" then
		pattern = "^" .. affix[1] .. hyphen
	else
		pattern = hyphen .. affix[1] .. "$"
	end
	return find(word, pattern) ~= nil
end
-- affix application function
-- Marks at most one prefix (as "prefix>") and at most one suffix (as "<suffix").
-- word:      the text to mark
-- depth:     component depth; at depth 1 only affixes set off by a hyphen are
--            marked and the hyphen is replaced by the marker
-- affix_set: table with the sequences "pre" and "suf" of affix entries
-- pos:       part-of-speech string handed on to is_valid_affix()
-- returns the marked text
-- (parameters renamed from "string"/"affixes", which shadowed the string library
-- and the module-level affix table)
local function apply_affixes(word, depth, affix_set, pos)
	-- match hyphen at appropriate depth
	local hyphen = is_hyphen_depth(depth)
	-- process prefixes; the optional ">" swallows a marker that an earlier pass
	-- already placed, so "be>gin" (from "be-gin" at depth 1) is no longer turned
	-- into "be>>gin" when it is processed again at depth 2
	for _, affix in ipairs(affix_set.pre) do
		if is_valid_affix(word, affix, pos, "pre", depth) then
			word = rsub(word, "^" .. affix[1] .. hyphen .. ">?", affix[1] .. ">")
			break
		end
	end
	-- process suffixes; the optional "<" plays the same role as ">" above
	for _, affix in ipairs(affix_set.suf) do
		if is_valid_affix(word, affix, pos, "suf", depth) then
			word = rsub(word, "<?" .. hyphen .. affix[1] .. "$", "<" .. affix[1])
			break
		end
	end
	return word
end
-- components parsing function: walks a word through three depths
--   depth 0 - split on double hyphens
--   depth 1 - mark hyphenated affixes with > and <, then split on single hyphens
--   depth 2 - mark non-hyphenated affixes with > and <
-- any other depth returns the word untouched
local function split_components(word, depth, etyl, pos)
	-- initialise some variables
	depth = depth or 0
	pos = pos or ".*"

	-- split text on sep_pattern and send every piece one level deeper, gluing the
	-- results back together with joiner; unsplit text goes down as a whole
	local function descend(text, sep_pattern, joiner)
		local pieces = split(text, sep_pattern)
		if #pieces <= 1 then
			return split_components(text, depth + 1, etyl, pos)
		end
		local processed = {}
		for i, piece in ipairs(pieces) do
			processed[i] = split_components(piece, depth + 1, etyl, pos)
		end
		return table.concat(processed, joiner)
	end

	if depth == 0 then
		return descend(word, "%-%-", "--")
	elseif depth == 1 then
		return descend(apply_affixes(word, depth, affixes, pos), "%-", "-")
	elseif depth == 2 then
		return apply_affixes(word, depth, affixes, pos)
	end
	return word
end
-- component generation function: splits every word into components, wraps it in
-- "#" word-border markers and joins the words with single spaces
local function to_components(words, etyl, pos)
	local marked = {}
	for i, word in ipairs(words) do
		marked[i] = "#" .. split_components(word, 0, etyl, pos) .. "#"
	end
	return table.concat(marked, " ")
end
-- syllabification function (FIXME: work on IPA and syllabification separately)
-- Inserts the syllable separator "‧" into a term whose words are delimited by "#".
-- term:      text to syllabify; the combining diaeresis is only recognised when the
--            text is in decomposed form
-- etyl, pos: accepted like in the other stages, not used here
-- returns the term with "‧" between syllables; "{", "}" and "-" are stripped, "#" stays
local function syllabify(term, etyl, pos)
	-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenation form)
	term = rsub(term, "(" .. V .. ")" .. dia, "‧%1")
	-- mark trigraphs and digraphs with curly braces; every grapheme is first swapped
	-- for a private-use placeholder so that a shorter grapheme can no longer match
	-- inside a longer one that is already marked (the nested result "{{aa}i}" used
	-- to end up split as "aa‧i"), and only then expanded to its braced form
	-- (assumes the input itself contains no characters from U+E001 upwards)
	local braced = {}
	for i, graph in ipairs(graphemes_sorted) do
		local placeholder = u(0xE000 + i)
		braced[placeholder] = "{" .. graph .. "}"
		term = rsub(term, graph, placeholder)
	end
	-- characters without an entry in the table are left as they are
	term = rsub(term, ".", braced)
	-- add dot before consonant + vowel
	term = rsub(term, "(" .. C .. "?{?" .. V .. A .. "?)", "‧%1")
	-- remove any dots inside brackets
	term = rsub(term, "{[^}]*}", function(a) return rsub(a, "‧", "") end)
	-- shift dot before certain consonant clusters and digraphs
	term = rsub(term, "([bcfgkpvw])‧l", "‧%1l") -- clusters with l
	term = rsub(term, "([bcdfgkptwv])‧r", "‧%1r") -- clusters with r
	term = rsub(term, "([dst])‧j", "‧%1j") -- digraphs with j
	term = rsub(term, "([ckgt])‧h", "‧%1h") -- digraphs with h
	term = rsub(term, "n‧g", "ng‧") -- ng is syllable-final
	-- term = rsub(term, ">s‧", ">‧s") -- s can form a cluster after a prefix
	-- remove leading dots and brackets
	term = rsub(term, "#(" .. non_V .. "*)‧", "#%1")
	term = rsub(term, "%.", "‧")
	term = rsub(term, "[{}-]", "") -- comment out to debug
	-- collapse runs of separators into one
	return rsub(term, "‧+", "‧")
end
-- hyphenation function
-- term:      the text to hyphenate, or a table whose args[1] holds it
-- etyl, pos: accepted for parity with export.show, not used here
-- returns the text with "‧" between syllables, recomposed to NFC and stripped of
-- the internal markers "#", "[", "]", "<" and ">"
function export.hyphenation(term, etyl, pos)
	-- get user input as table
	if type(term) == "table" then
		term = term.args[1]
	end
	-- decompose accents first: syllabify() matches bare vowel letters and the
	-- combining diaeresis, so a precomposed letter such as "ë" would otherwise be
	-- treated as neither a vowel nor a syllable break (the result is recomposed
	-- again below)
	term = decomp(term)
	-- mark all word borders
	term = rsub(term, "([^ ]+)", "#%1#")
	-- format hyphenation
	-- TODO: hand the result to the hyphenation module instead of returning a string:
	-- local data = { lang = lang, sc = sc, hyphs = {{hyph = rsub(syllabify(term), "[#%[%]<>]", ""), "%.")}} }
	-- return hyphen.format_hyphenations(data)
	return rsub(recomp(syllabify(term)), "[#%[%]<>]", "")
end
-- generate substitutions function: picks from the rule list the {pattern, replacement}
-- pairs that apply to the given etymology, keeping only the first applicable rule for
-- each pattern and preserving list order
local function generate_subs(term, etyl, pos)
	local selected = {}
	local taken = {}
	for _, rule in ipairs(subs) do
		local pattern, replacement, rule_etyl = rule[1], rule[2], rule[3]
		-- a rule applies when it is tied to the requested etymology or is a default ("-") rule
		local is_specific = etyl ~= "-" and rule_etyl == etyl
		if not taken[pattern] and (is_specific or rule_etyl == "-") then
			taken[pattern] = true
			selected[#selected + 1] = {pattern, replacement}
		end
	end
	return selected
end
-- stress assignment function
-- term: "#"-delimited text
-- etyl: etymology code of the term; nil, "af" and "-" all count as "not a loanword"
-- pos:  accepted like in the other stages, not used here
-- returns the term with the stress mark "ˈ" inserted
--
-- Entries of stressed_endings are either plain strings or tables holding the
-- ending in slot 1 plus qualifiers; the previous version concatenated the table
-- itself and raised an error as soon as the loop reached such an entry.
--   orig = "loan"   - the ending only counts in loanwords
--   orig = <code>   - the ending only counts when etyl equals that code (e.g. "fr")
--   stress = "pre"  - the stress falls on the syllable before the ending
local function stress(term, etyl, pos)
	-- words with certain endings are syllable-final stressed
	for _, entry in ipairs(stressed_endings) do
		local ending, orig, placement = entry, nil, nil
		if type(entry) == "table" then
			ending, orig, placement = entry[1], entry.orig, entry.stress
		end
		if find(term, ending .. "#") then
			local applies = true
			if orig == "loan" then
				-- the old test "not etyl and etyl ~= 'af'" was only true for a
				-- missing etymology, i.e. exactly the wrong way round
				applies = etyl ~= nil and etyl ~= "af" and etyl ~= "-"
			elseif orig then
				applies = etyl == orig
			end
			-- an ending whose restriction fails is skipped instead of ending the
			-- search, so later endings (e.g. "iel" after "el") still get a chance
			if applies then
				if placement == "pre" then
					-- heuristic: put the mark in front of the last vowel group (with
					-- the consonants that follow it) before the ending; it is meant
					-- to be shifted to the syllable boundary by a later stage
					local marked = rsub(term, "(" .. V .. "+" .. A .. "?" .. C .. "*)" .. ending .. "#", "ˈ%1" .. ending .. "#")
					if marked ~= term then
						return marked
					end
					-- nothing precedes the ending: carry on with the remaining entries
				else
					return rsub(term, ending .. "#", "ˈ" .. ending .. "#")
				end
			end
		end
	end
	-- add stress mark to first syllable if no ending was stressed
	return rsub(term, "^#", "#ˈ")
end
-- pronunciation function: for now this only canonicalises the text, marks its
-- components and tidies the markers; the phonemic stages are still disabled
local function toIPA(text, etyl, pos)
	-- canonicalise the text into words and mark them with the appropriate components
	local term = to_components(canonicalise(text), etyl, pos)
	-- disabled stages, kept for reference:
	-- add stress to term
	-- term = stress(term, etyl, pos)
	-- syllabify term
	-- term = syllabify(term, etyl, pos)
	-- shift stress rightwards to a syllable boundary
	-- term = rsub(term, "([^" .. syll_boundary .. "]*)ˈ", "ˈ%1")
	--[[
	-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
	local to_sub = generate_subs(term, etyl, pos)
	-- go over substitution table
	for _, s in ipairs(to_sub) do
	local k, v = s[1], s[2]
	rsub(term, k, v)
	end
	-- make text lowercase again
	term = lower(term)
	-- substitute graphemes
	for graph, phoneme in pairs(graphemes) do
	term = rsub(term, graph, phoneme)
	end
	-- substitute single-letter vowels
	term = rsub(term, "([aeiou])([‧#ː" .. cons .. "])", function(a, b)
	if match("[‧#]", b) then
	return sets.vowel_length[a][2] .. b -- for open syllables
	else
	return sets.vowel_length[a][1] .. b -- for closed syllables
	end
	end)
	-- replace į, ů, ü with their actual phonetic values
	term = rsub(term, "[įůü]", {["į"] = "i", ["ů"] = "u", ["ü"] = "y"})
	-- remove double consonants
	term = rsub(term, "(.)(‧?)%1", "%2%1")
	]]--
	-- final adjustments: syllable separators become plain dots, then the word
	-- border and bracket markers are dropped
	local dotted = rsub(term, "‧", ".")
	return rsub(dotted, "[#%[%]]", "")
end
-- main export function: accepts either the text itself or a table whose args[1]
-- holds the text, and returns the result of toIPA() for it
function export.show(term, etyl, pos)
	local text = term
	if type(text) == "table" then
		-- get user input from the table
		text = text.args[1]
	end
	return toIPA(text, etyl, pos)
end
return export