teclyn

#!/bin/bash

###
# usage: ./teclyn -[a, b, d, e, f, i, n, s, t] OPTION
# -a gair
#       adds 'gair'  to dictionary (may include
#       dictionary tags on the end e.g.  /AFG).
# -b
#       builds the zipped  libreoffice package.
# -d
#	git diff the dictionary file and  print
#	only  the  words  that   have   changed
#	(usually they have been added).
# -e gair
#       use 'wordforms' from hunspell to expand
#       the   given  word  with  all   relevant
#       permitted  affixes in  the  dictionary.
# -f gair
#	find the word or word  fragment  in the
#	dictionary  file and print all matches:
#	accepts regex.
# -i
#	repeatedly  prompt for words,  then  -a
#	add them (this first checks if they are
#	in  the  dictionary   already,  and  by
#	default auto-tags with treigladau tags).
# -n rhestr_o_eiriau.txt
#	search through a list of words (one per
#	line) to find ones unknown to the  dict
#	and add them to an output file.
# -s gair
#	check  whether  word  'gair'  (or words
#	in file 'gair') are in the dictionary.
# -t
#	sort the .dic file  alphabetically  and
#	add the number of lines to the top.
###

# ANSI colour codes. The backslash is stored literally, so the values only
# turn into real escape sequences when printed by something that interprets
# backslash escapes.
ENDC="\033[0m"
RED="\033[91m"
GREEN="\033[92m"
YELLOW="\033[93m"
BLUE="\033[94m"
PINK="\033[95m"

parse () {
	###
	# get command line options
	#   $1 - the option, in short or long form
	#   $2 - the word or file name, for the options that take one
	# runs the function matching the option; an unrecognised or missing
	# option prints the usage text and exits the script with status 3
	###
	case "$1" in
		-a|--add)
			ychwanegu "$2"
			;;
		-b|--build)
			adeiladu
			;;
		-d|--diff)
			geiriau_a_newidiwyd
			;;
		-e|--expand)
			ystyried "$2"
			;;
		-f|--find)
			ffeindio "$2"
			;;
		-i|--insert)
			insertmode "$2"
			;;
		-n|--new)
			newydd "$2"
			;;
		-s|--spell)
			gwirio "$2"
			;;
		-t|--tidy)
			tacluso "dictionaries/cy_GB.dic"
			;;
		--)
			# bare end-of-options marker: there is nothing to run
			;;
		*)
			echo "usage: ./teclyn -[a, b, d, e, f, i, n, s, t] OPTION
-a gair
	adds 'gair'  to dictionary (may include
	dictionary tags on the end e.g.  /AFG).
-b
	builds the zipped  libreoffice package.
-d
	git diff the dictionary file and  print
	only  the  words  that   have   changed
	(usually they have been added).
-e gair
	use 'wordforms' from hunspell to expand
	the   given  word  with  all   relevant
	permitted  affixes in  the  dictionary.
-f gair
	find the word or word  fragment  in the
	dictionary  file and print all matches:
	accepts regex
-i
	repeatedly  prompt for words,  then  -a
	add them (this first checks if they are
	in  the  dictionary   already,  and  by
	default auto-tags with treigladau tags.
-n rhestr_o_eiriau.txt
	search through a list of words (one per
	line) to find ones unknown to the  dict
	and add them to an output file.
-s gair
	check  whether  word  'gair'  (or words
	in file 'gair') are in the dictionary.
-t
	sort the .dic file  alphabetically  and
	add the number of lines to the top."

			exit 3
			;;
	esac
	# a recognised option always reports success from here, whatever the
	# function it ran returned; kept so existing callers of the script see
	# the same exit status as before
	return 0
}

ychwanegu () {
	###
	# add a word to the dictionary, optionally with prefix / suffix tags
	#   $1 - the word, either bare (gair) or already tagged (gair/TAGS)
	# a word hunspell does not recognise is appended to the .dic file and to
	# playground/word_list.txt, after which the .dic file is rewritten as
	# "<number of entries>" followed by the sorted, de-duplicated entries.
	# an untagged word is offered mutation tags (L, M, T, H) worked out from
	# its first letters, and the user confirms with a single key press.
	# finally any words in the word list still unknown to hunspell are printed
	###
	local dic="dictionaries/cy_GB.dic"
	local dic_tmp="dictionaries/dict.tmp"
	local word_list="playground/word_list.txt"
	# 1 means "do not write"; set to 0 once the entry to write is settled
	local autotag=1
	local dictgair="$1"
	# remove any dictionary tags from the end i.e. rhywedd/M becomes rhywedd
	local gair="${1%%/*}"
	local anhysbys llaes meddal trwynol hprosth tags ateb

	# print out matches or closest words
	gwirio "$gair"
	# empty if hunspell recognises the word
	anhysbys=$(hunspell -L -d dictionaries/cy_GB <<< "$gair")

	if [[ -n "$anhysbys" ]] ; then
		if [[ "$gair" != "$dictgair" ]] ; then
			# the word came with its own tags: write it exactly as given
			echo "Not autotagging, tags already added."
			autotag=0
		else
			llaes=""
			meddal=""
			trwynol=""
			hprosth=""
			# mutations are only possible on certain word beginnings
			if [[ "$gair" =~ (^[ctpCTP][^hH].*) ]] ; then llaes="L" ; fi
			if [[ "$gair" =~ (^[ctpCTP][^hH].*|^[lL][lL].*|^[rR][hH].*|^[dD][^dD].*|^[gG].*|^[bB].*|^[Mm].*) ]] ; then meddal="M" ; fi
			if [[ "$gair" =~ (^[ctpCTP][^hH].*|^[dD][^dD].*|^[gG].*|^[bB].*) ]] ; then trwynol="T" ; fi
			if [[ "$gair" =~ ^[aeiouwyAEIOUWYâêîôûŵŷÂÊÎÔÛŴŶ].* ]] ; then hprosth="H" ; fi
			tags="/$llaes$meddal$trwynol$hprosth"
			# the word is passed as a printf argument, never as part of the
			# format, so a "%" or "\" in it is printed literally
			if [[ "$tags" == "/" ]] ; then
				printf 'Add word "%s" with no tags detected? [y/n]' "$gair"
			else
				dictgair="$gair$tags"
				printf 'Add word "%s" with tags %s , or (a) add without tags? [y/n/a]' "$gair" "$tags"
			fi
			# a single key press decides: y = add as offered, a = add without
			# tags, anything else = do not add
			read -r -n 1 -p " " ateb
			echo
			if [[ "$ateb" =~ [yY] ]] ; then autotag=0 ; fi
			if [[ "$ateb" =~ [aA] ]] ; then autotag=0 ; dictgair="$gair" ; fi
		fi
		# Add word
		if [[ $autotag -ne 1 ]] ; then
			echo "Writing word \"$dictgair\" to dictionary"
			printf '%s\n' "$gair" >> "$word_list"
			printf '%s\n' "$dictgair" >> "$dic"
			# the first line of the .dic file is the entry count, so it is
			# skipped (tail -n +2) and regenerated. Both passes use the C
			# locale so the count always matches the lines written
			tail -n +2 "$dic" | LC_ALL=C sort -u | wc -l > "$dic_tmp"
			tail -n +2 "$dic" | LC_ALL=C sort -u >> "$dic_tmp"
			# replace the dictionary with the tidied copy
			mv "$dic_tmp" "$dic"
			echo "Added word."
		else
			echo "Not adding word."
		fi
	fi

	# contains any unknown words in the word list
	anhysbys=$(hunspell -L -d dictionaries/cy_GB "$word_list")
	if [[ -n "$anhysbys" ]] ; then
		echo "The dictionary does not recognise the following:"
		echo "$anhysbys"
	fi
}

adeiladu () {
	###
	# make the libreoffice extension
	# reads the version number from description.xml and zips the package
	# files into libreoffice-geiriadur-<version>.oxt in the current directory
	# returns 1 (with a message on stderr) if no version can be read or if
	# zip fails, instead of reporting a successful build
	###
	local version
	echo "Building..."
	# clean the folder of oxts (which are just renamed zips); errors are
	# silenced because there may be no earlier build to move
	mv -- *.oxt archive/ 2> /dev/null

	# figure out which version we're building
	version="$( sed -n 's|.*<version value="\(.*\)" />|\1|p' description.xml )"
	if [[ -z "$version" ]] ; then
		echo "Could not read a version number from description.xml" 1>&2
		return 1
	fi

	# zip the latest files
	if ! zip -qr "libreoffice-geiriadur-$version.oxt" description/ META-INF/ dictionaries/cy_GB.dic dictionaries/cy_GB.aff baner_libreoffice_cymreig.svg description.xml dictionaries.xcu LICENSE README.txt ; then
		echo "Build failed: zip returned an error" 1>&2
		return 1
	fi

	echo "Built."
}

geiriau_a_newidiwyd () {
	###
	# print out the dic word count, then the stems of any new words added
	# (i.e. everything before the '/' on any lines that have changed)
	#
	# only added lines are kept: the pattern is quoted so the shell cannot
	# expand it as a file glob, and anchored so a "+" in the middle of a
	# removed or context line does not count
	# NOTE(review): added lines whose first character is outside
	# [a-zA-Z0-9] (e.g. an accented letter) are not matched - confirm
	# whether that is wanted
	###
	git diff dictionaries/cy_GB.dic \
		| grep '^[+][a-zA-Z0-9]' \
		| grep -v @ \
		| sed -e 's/^+//g' -e 's/\/[a-zA-Z]*//g'
}

ystyried () {
	###
	# expand a word with all available relevant affixes: the below is modified from 'wordforms'
	#   $1 - the word (stem) to expand
	# returns 0 when ./wordforms succeeds; otherwise asks hunspell about the
	# word, prints an explanation or its suggestions on stderr and returns 1
	###
	local suggest
	if ./wordforms ./dictionaries/cy_GB.aff ./dictionaries/cy_GB.dic "$1" ; then
		return 0
	fi
	# hunspell -a answers with a version line starting "@", then one result
	# line: "*" (known), "# ..." (unknown) or "& word n offset: suggestions".
	# Drop the version line and everything up to the ": " that precedes the
	# suggestions, then join what is left onto one line
	suggest="$(hunspell -d ./dictionaries/cy_GB -a <<< "$1" 2>/dev/null | sed -e 's/^@.*$//' -e 's/^.*: //' | tr -d '\n')"
	if [[ "$suggest" == "*" ]] ; then
		echo Dyw "$1" ddim yn bôn ond mae yn y geiriadur 1>&2
	elif [[ "${suggest:0:1}" == "#" ]] ; then
		echo Dyw "$1" ddim yn y geiriadur 1>&2
	else
		# suggestions are printed through %s so nothing in them is read as
		# a printf directive
		printf "Oeddych chi'n golygu: %s\n" "$suggest" 1>&2
	fi
	return 1
}

ffeindio () {
	###
	# find dictionary entry matching search term (basically just grep)
	#   $1 - the word, fragment or regex to look for
	# prints every matching line of dictionaries/cy_GB.dic
	###
	# "--" ends grep's options, so a fragment such as "-ion" is treated as
	# the pattern rather than as a set of flags
	grep -- "$1" dictionaries/cy_GB.dic
}

insertmode () {
	###
	# repeatedly prompt for a word and hand each one to ychwanegu
	# the loop ends when the input ends (e.g. Ctrl-D) rather than spinning
	# forever on an empty word; otherwise it runs until interrupted
	###
	local word
	while :
	do
		echo
		printf "Word: "
		# -r keeps backslashes as typed; a final line without a trailing
		# newline is still processed before the loop stops
		if ! read -r word ; then
			[[ -n "$word" ]] && ychwanegu "$word"
			break
		fi
		ychwanegu "$word"
	done
}

newydd () {
	###
	# search through a list of words and add the unknown ones to a file
	#   $1 - path of a text file with one word per line
	# the words hunspell does not recognise are merged, sorted and
	# de-duplicated, into dictionaries/newydd.txt; entries already in that
	# file are re-checked and dropped once the dictionary knows them
	# returns 1 if the word list cannot be read
	###
	local out="dictionaries/newydd.txt"
	local tmp="dictionaries/newydd.tmp"
	# without a file name hunspell would sit waiting on standard input
	if [[ -z "$1" || ! -r "$1" ]] ; then
		echo "newydd: cannot read word list \"$1\"" 1>&2
		return 1
	fi
	touch "$out"
	hunspell -d dictionaries/cy_GB -L "$out" > "$tmp"
	hunspell -d dictionaries/cy_GB -L "$1" >> "$tmp"
	sort -u "$tmp" > "$out"
	rm "$tmp"
}

gwirio () {
	###
	# check whether a word is in the dictionary
	# if $1 is a file, check through it for non
	# -english words that aren't in the cymraeg
	# dictionary either
	#   $1 - a single word, or the path of a text file
	# file mode prints progress on stderr and the unknown words, sorted and
	# de-duplicated, on stdout; word mode prints a one-line verdict
	###
	# TODO: file names with spaces were reported not to work here; the file
	# is now read with its name quoted, but the name may already have been
	# split before it reaches this function - check the caller
	local to_check cymraeg_errors errors response
	if [ -f "$1" ] ; then
		echo "Checking file: $1" 1>&2
		to_check="$(cat -- "$1")"
		echo "Processing for Cymraeg errors..." 1>&2
		cymraeg_errors="$(hunspell -l -d dictionaries/cy_GB <<< "$to_check")"
		echo "Filtering out English words..." 1>&2
		errors="$(hunspell -l -d en_GB <<< "$cymraeg_errors")"
		echo "Errors:" 1>&2
		echo " " 1>&2
		printf '%s\n' "$errors" | sort -u | tr ' ' '\n'
	else
		# hunspell -a prints a version line, the result line and a blank
		# line; the command substitution strips the trailing blank, so the
		# last remaining line is the result
		response="$(printf '%s\n' "$1" | hunspell -d dictionaries/cy_GB -a)"
		response="${response##*$'\n'}"
		case "${response:0:1}" in
			"*")
				echo "Correct, in dictionary."
				;;
			"&")
				# the suggestions follow the last ": " on the line
				printf 'Closest matches:\t%s\n' "${response##*: }"
				;;
			"+")
				echo "Found using affix removal."
				;;
			"-")
				echo "Found, compound."
				;;
			"#")
				echo "Unknown word, no suggestions."
				;;
		esac
	fi
}

tacluso () {
	###
	# tidy the dic file
	#   $1 - path of the .dic file
	# everything after the first line (the old entry count) is sorted,
	# duplicate lines are dropped, and the file is rewritten with the new
	# number of entries on its first line
	###
	local tmp="dictionaries/dict.tmp"
	# count and sort both run in the C locale, so the count on line one
	# always equals the number of lines written below it
	tail -n +2 "$1" | LC_ALL=C sort -u | wc -l > "$tmp"
	tail -n +2 "$1" | LC_ALL=C sort -u >> "$tmp"
	mv "$tmp" "$1"
}

smallflags () {
	# abc/ABCabc format typically
	# capture what this returns for just the small flags on a line
	#   $1 - a dictionary line, with or without "/FLAGS"
	# prints the lowercase flags, or an empty line when there are none
	local intext=$1
	local rc
	# hasflags reports through its exit status (it prints nothing):
	# 1 means the line has no "/" at all, so there are no flags to extract
	hasflags "$intext"
	rc=$?
	if [[ $rc -ne 1 ]] ; then
		# keep what follows the last "/" and drop the capital letters
		sed -e "s|.*/||" -e "s|[A-Z]*||g" <<< "$intext"
	else
		echo ""
	fi
}

capsflags () {
	# abc/ABCabc format typically
	# capture what this returns for just the capital flags on a line
	#   $1 - a dictionary line, with or without "/FLAGS"
	# prints the uppercase flags, or an empty line when there are none
	local intext=$1
	local rc
	# hasflags reports through its exit status (it prints nothing):
	# 1 means the line has no "/" at all, so there are no flags to extract
	hasflags "$intext"
	rc=$?
	if [[ $rc -ne 1 ]] ; then
		# keep what follows the last "/" and drop the small letters
		sed -e "s|.*/||" -e "s|[a-z]*||g" <<< "$intext"
	else
		echo ""
	fi
}

noflags () {
	# abc/ABCabc format typically
	# capture what this returns for just the line without flags or delimiter /
	#   $1 - a dictionary line, with or without "/FLAGS"
	# the result is printed as-is: spaces inside the entry are kept and
	# nothing in it is expanded as a file glob
	sed -e "s|/.*||" <<< "$1"
}

allflags () {
	# abc/ABCabc format typically
	# capture what this returns for just the flags without line or delimiter /
	#   $1 - a dictionary line, with or without "/FLAGS"
	# prints everything after the last "/", or an empty line when the line
	# has no "/"
	local intext=$1
	local rc
	# hasflags reports through its exit status (it prints nothing):
	# 1 means the line has no "/" at all, so there are no flags to extract
	hasflags "$intext"
	rc=$?
	if [[ $rc -ne 1 ]] ; then
		sed -e "s|.*/||" <<< "$intext"
	else
		echo ""
	fi
}

addflag () {
	# return in abc/ABCabc format
	# capture what this returns for the same line with
	# the flag added to the corresponding flag grouping
	# i.e. assuming input "abc/ABcde X" this will return
	# abc/ABXcde, or "abc/ABcde x" for abc/ABcdex
	#   $1 - a dictionary line, with or without "/FLAGS"
	#   $2 - the flag (or flags) to add
	# the line is passed to the helpers quoted, so an entry containing
	# spaces stays in one piece
	local intext=$1
	local inflag=$2
	local outflags
	outflags="$( allflags "$intext" )$inflag"
	printf '%s/%s\n' "$( noflags "$intext" )" "$( sort_flags "$outflags" )"
}

hasflags () {
	# report, through the exit status only, whether a dictionary line has flags
	#   0 - a "/" followed by at least one character
	#   2 - the line ends in "/" with nothing after it
	#   1 - anything else (no "/" at all)
	if [[ "$1" == */?* ]] ; then
		return 0
	elif [[ "$1" == */ ]] ; then
		return 2
	fi
	return 1
}

sort_flags () {
	###
	# give it a string of flags to sort
	#   $1 - the flags, optionally preceded by "word/"
	# prints two groups, each sorted: first every flag that is not a small
	# letter a-z, then the small letters. A flag that is neither a capital
	# nor a small letter therefore appears once, in the first group, and
	# nothing is expanded as a file glob
	###
	local flags caps smalls
	flags="${1##*/}"
	caps="$( sed -e "s|[a-z]*||g" <<< "$flags" | grep -o . | sort | tr -d "\n" )"
	smalls="$( sed -e "s|[^a-z]*||g" <<< "$flags" | grep -o . | sort | tr -d "\n" )"
	printf '%s\n' "$caps$smalls"
}

get_flags () {
	# get_flags "abc/ABC"
	# prints the flags of the entry separated by spaces
	# (an empty line when the entry has no "/")
	# how the flag string is cut up depends on the flag type reported by
	# get_flag_type for dictionaries/cy_GB.aff:
	#   num  - flags are separated by commas
	#   long - every flag is two characters
	#   anything else - every flag is one character
	local intext flag_type i
	local -a flags=()
	if [[ "$1" != */* ]] ; then
		echo ""
		return 0
	fi
	intext="${1##*/}"
	flag_type="$( get_flag_type dictionaries/cy_GB.aff )"
	if [[ "$flag_type" == "num" ]] ; then
		# read splits on whitespace without expanding globs
		read -r -a flags <<< "${intext//,/ }"
	elif [[ "$flag_type" == "long" ]] ; then
		for (( i=0 ; i*2<${#intext} ; i++ )); do
			flags+=( "${intext:i*2:2}" )
		done
	else
		for (( i=0 ; i<${#intext} ; i++ )); do
			flags+=( "${intext:i:1}" )
		done
	fi
	printf '%s\n' "${flags[*]}"
}

get_flag_type () {
	# get_flag_type path/to/dict.aff
	# returns 'long', 'num' or default 'unicode'
	# the type is taken from the first FLAG line of the .aff file; a missing
	# FLAG line, or any value other than long/num (hunspell also accepts
	# "UTF-8"), is reported as 'unicode', i.e. one character per flag
	local type
	type="$( awk '/^FLAG[ \t]/ {print $2; exit}' "$1" )"
	case "$type" in
		long|num) echo "$type" ;;
		*) echo "unicode" ;;
	esac
}

get_affixes_in_affix () {
	# get_affixes_in_affix path/to/dict.aff flag_name
	# collects the flags that appear after the "/" in field 4 of the SFX/PFX
	# rule lines belonging to flag_name, de-duplicated and sorted
	# returns 1, printing nothing, when no rule of that flag carries any
	# return_type:
	# =0 return array of flag names
	# =1 return regex that matches any flag name (built by flag_regex)
	local return_type=1
	local flag_type awk_affix i
	local -a affixes=() awk_affixes=() parts=()
	flag_type="$( get_flag_type "$1" )"
	# the flag name is compared as a whole string with field 2, so asking
	# for "12" does not also pick up the rules of flag "123"
	mapfile -t awk_affixes < <( awk -v "flag=$2" '
		($1 == "SFX" || $1 == "PFX") && ($2 "") == (flag "") && NF > 4 && $4 ~ "/" {
			split($4, a, "/")
			if (a[2] != "") print a[2]
		}
		' "$1" )
	(( ${#awk_affixes[@]} == 0 )) && return 1
	# cut each flag string into single flags, according to the flag type
	for awk_affix in "${awk_affixes[@]}" ; do
		if [[ "$flag_type" == "num" ]] ; then
			read -r -a parts <<< "${awk_affix//,/ }"
			affixes+=( "${parts[@]}" )
		elif [[ "$flag_type" == "long" ]] ; then
			for (( i=0 ; i*2<${#awk_affix} ; i++ )); do
				affixes+=( "${awk_affix:i*2:2}" )
			done
		else
			for (( i=0 ; i<${#awk_affix} ; i++ )); do
				affixes+=( "${awk_affix:i:1}" )
			done
		fi
	done
	# remove duplicates
	if (( ${#affixes[@]} > 0 )) ; then
		mapfile -t affixes < <( printf '%s\n' "${affixes[@]}" | sort -u )
	fi
	if [[ $return_type -eq 0 ]] ; then
		echo "${affixes[@]}"
	else
		echo "$( flag_regex "$1" "${affixes[@]}" )"
	fi
}

flag_regex () {
	# flag_regex path/to/dict.aff array_of_flags
	# returns regex matching the flags
	#   "unicode" flag type: a bracket expression, e.g. [ABC]
	#   any other type:      an alternation, e.g. (Aa|Bb)
	# returns 1, printing nothing, when no flags are given
	# NOTE(review): flags are inserted as-is; a flag that is itself a regex
	# metacharacter (e.g. "]" or "|") is not escaped
	local aff_file=$1
	shift
	local flag flag_type regex
	local -a wanted=()
	# flags are handled quoted, so "*" or "?" is never expanded as a glob
	for flag in "$@" ; do
		[[ -n "$flag" ]] && wanted+=( "$flag" )
	done
	(( ${#wanted[@]} > 0 )) || return 1
	flag_type="$( get_flag_type "$aff_file" )"
	if [[ "$flag_type" == "unicode" ]] ; then
		printf -v regex '%s' "${wanted[@]}"
		regex="[$regex]"
	else
		# "${wanted[*]}" joins the flags with the first character of IFS
		local IFS='|'
		regex="(${wanted[*]})"
	fi
	printf '%s\n' "$regex"
}

separate_affixes () {
	# not implemented yet: does nothing and reports success
	# the plan is to split a flag string according to the affix type
	#   num:     at every ","
	#   long:    after every 2nd character
	#   unicode: after every character
	return 0
}

checkmutationflags () {
	# capture what this returns to check if the line has
	# "normal" mutation rules applied to it or not
	#   $1 - a dictionary line, with or without "/FLAGS"
	# prints 0 when the mutation flags on the line (L, M, T, H) are exactly
	# the ones expected from the way the word begins, 1 otherwise
	local line=$1
	local flags lemma mutflags expflags
	# the line is passed on quoted so an entry containing spaces stays whole
	flags="$( capsflags "$line" )"
	lemma="$( noflags "$line" )"
	lemma=${lemma,,} #lowercase
	# strip out everything except the mutation flags,
	# then sort them alphabetically
	mutflags="$( sed "s|[^LMTH]||g" <<< "$flags" | grep -o . | sort | tr -d "\n" )"
	# now figure out the expected mutation flags and
	# compare with the mutation flags we actually have
	expflags=""
	if [[ ${lemma::1} =~ [aeiouwyâêîôûŵŷï] ]] ; then expflags="H" ; fi
	if [[ ${lemma::1} =~ [gbm] ]] || [[ ${lemma::2} =~ d[^d] ]] || [[ ${lemma::2} =~ ll ]] || [[ ${lemma::2} =~ rh ]] || [[ ${lemma::2} =~ [cpt][^h] ]] ; then expflags=$expflags"M" ; fi
	if [[ ${lemma::1} =~ [gb] ]] || [[ ${lemma::2} =~ [cpt][^h] ]] || [[ ${lemma::2} =~ [d][^d] ]] ; then expflags=$expflags"T" ; fi
	if [[ ${lemma::2} =~ [cp][^h] ]] || [[ ${lemma::2} =~ [t][^hs] ]] ; then expflags=$expflags"L" ; fi
	# same ordering as mutflags so the two strings can be compared directly
	expflags="$( printf '%s\n' "$expflags" | grep -o . | sort | tr -d "\n" )"
	if [[ "$mutflags" == "$expflags" ]] ; then echo 0 ; else echo 1 ; fi
}

# run the requested action; the arguments are passed on quoted so that a word
# or file name containing spaces reaches parse in one piece
parse "$@"

#while read line ; do
#	flagless="$( sed 's|ion/.*||' <<< $line )"
#	shortened="$( grep $flagless'/' ionless )"
#	if [[ "$shortened" == "" ]] ; then echo -e $YELLOW"----- skipping $line"$ENDC ; continue; fi
#	echo -e $BLUE"$line"$ENDC
#	orig_flags="$( sed 's|.*/||' <<< $shortened )"
#	out_flags="$( addflag $shortened C )"
#	echo "$flagless"
#	echo -e $RED"$shortened"$ENDC
#	echo -e $GREEN"$out_flags"$ENDC
#	echo
#done < ion_muts_only

# while read line ; do
# 	flagless="$( sed 's|/.*||' <<< $line )"
# 	#shortened="$( grep $flagless'/' ionless )"
# 	#if [[ "$shortened" == "" ]] ; then echo -e $YELLOW"----- skipping $line"$ENDC ; continue; fi
# 	echo -e $BLUE"$line"$ENDC
# 	orig_flags="$( sed 's|.*/||' <<< $line )"
# 	out_with_flags="$( addflag $line R )"
# 	echo "$flagless"
# 	echo -e $RED"$flagless/$orig_flags"$ENDC
# 	echo -e $GREEN"$out_with_flags"$ENDC
# 	sed -i 's|'"$flagless"'ol.*$||' dictionaries/cy_GB.dic
# 	sed -i 's|'"$flagless/$orig_flags"'|'"$out_with_flags"'|' dictionaries/cy_GB.dic
# 	echo
# done < ol_matches