#Usage:
# awk [-v Dictionaries="sysdict1 sysdict2 ..."] -f spell.awk -- \
# [=suffixfile1 =suffixfile2 ...] [+dict1 +dict2 ...] \
# [-strip] [-verbose] [file(s)]
# Program skeleton: set everything up once before any input is read,
# check every input record, and print the collected exceptions at the end.
BEGIN {
    initialize()
}

{
    spell_check_line()
}

END {
    report_exception()
}
# initialize: one-time setup run from the BEGIN rule.
# Defines the word-separator pattern, then collects the dictionary list,
# parses the command line, and loads the dictionaries and suffix rules.
function initialize()
{
    # Every character that is neither an apostrophe nor an ASCII letter
    # is treated as a word separator.
    NonWordChars = "[^'A-Za-z]"

    get_dictionaries()
    scan_options()
    load_dictionaries()
    load_suffixes()
    order_suffixes()
}
# get_dictionaries: register the system dictionaries in DictionaryFiles.
#
# The list comes from the global Dictionaries (settable with
# "-v Dictionaries=..."), or, when that is empty, from the DICTIONARIES
# environment variable.  It is a whitespace-separated list of file names.
#
# Locals: key (index into files), files (split result).
function get_dictionaries( key, files)
{
    # ENVIRON maps environment variable names to their values.
    if (Dictionaries == "" && ("DICTIONARIES" in ENVIRON))
        Dictionaries = ENVIRON["DICTIONARIES"]

    if (Dictionaries == "")
    {
        # No built-in default is active; these candidates are disabled:
        # DictionaryFiles["/usr/share/dict/american-english"]++
        # DictionaryFiles["/usr/share/dict/british-english"]++
        return
    }

    # split() fills files[1], files[2], ... so walk the indices in order
    # until the first one that is missing.
    split(Dictionaries, files)
    for (key = 1; key in files; key++)
        DictionaryFiles[files[key]]++
}
# scan_options: consume the script's own command-line arguments.
#
#   -strip     sets the global Strip
#   -verbose   sets the global Verbose
#   =FILE      registers FILE in SuffixFiles and bumps NSuffixFiles
#   +FILE      registers FILE in DictionaryFiles
#
# Each recognised argument is replaced by "" in ARGV so that awk does not
# later try to open it as an input file.  Anything else is left alone.
#
# Local: k (index into ARGV; ARGV[0] is the command name, so start at 1).
function scan_options( k)
{
    for (k = 1; k < ARGC; k++)
    {
        if (ARGV[k] == "-strip")
        {
            Strip = 1
            ARGV[k] = ""
            continue
        }

        if (ARGV[k] == "-verbose")
        {
            Verbose = 1
            ARGV[k] = ""
            continue
        }

        if (ARGV[k] ~ /^=/)
        {
            # "=file": an extra suffix-rule file (name follows the "=")
            NSuffixFiles++
            SuffixFiles[substr(ARGV[k], 2)]++
            ARGV[k] = ""
            continue
        }

        if (ARGV[k] ~ /^[+]/)
        {
            # "+file": an extra dictionary (name follows the "+")
            DictionaryFiles[substr(ARGV[k], 2)]++
            ARGV[k] = ""
        }
    }

    # Drop empty arguments from the tail of ARGV.  Per the original note
    # this is needed for nawk, which otherwise would not fall back to
    # reading standard input.
    while (ARGC > 0 && ARGV[ARGC - 1] == "")
        ARGC--
}
# load_dictionaries: read every file named in DictionaryFiles and record
# each line, lower-cased, as a key of the global Dictionary array.
#
# Locals: file (current dictionary name), word (current line),
#         status (last getline result: 1 = line read, 0 = EOF, -1 = error).
#
# file and word are declared as locals so they no longer leak into the
# global namespace.  A dictionary that cannot be read is reported on
# standard error instead of being skipped silently (a missing dictionary
# otherwise shows up only as every word being flagged).
function load_dictionaries( file, word, status)
{
    for (file in DictionaryFiles)
    {
        while ((status = (getline word < file)) > 0)
            Dictionary[tolower(word)]++

        if (status < 0)
            print ("spell.awk: cannot read dictionary file: " file) > "/dev/stderr"

        close(file)
    }
}
# load_suffixes: fill the global Suffixes and Replacement tables.
#
# When suffix files were given on the command line (NSuffixFiles > 0),
# each non-empty line of each file in SuffixFiles supplies one rule:
#
#     SUFFIX-REGEXP  [REPLACEMENT ...]      # optional comment
#
# The first field becomes a key of Suffixes; the remaining fields are
# stored, space-separated, in Replacement[key].  strip_suffixes() applies
# the key with match(), so it is a regular expression and should be
# anchored with "$" to match only at the end of a word.  A replacement
# written as "" (two double quotes) stands for the empty ending.
#
# Without suffix files a small built-in table is used.  Its entries are
# anchored ("ed$", "ing$"): unanchored, match() would take the leftmost
# occurrence, e.g. turning "needed" into "ne" and "singing" into "s".
#
# Locals: file, k, line, n, parts,
#         status (last getline result: 1 = line read, 0 = EOF, -1 = error).
function load_suffixes( file, k, line, n, parts, status)
{
    if (NSuffixFiles > 0)
    {
        for (file in SuffixFiles)
        {
            while ((status = (getline line < file)) > 0)
            {
                sub(" *#.*$", "", line)     # strip comments
                sub("^[ \t]+", "", line)    # strip leading whitespace
                sub("[ \t]+$", "", line)    # strip trailing whitespace
                if (line == "")
                    continue

                # parts[1] is the suffix, parts[2..n] its replacements
                n = split(line, parts)
                Suffixes[parts[1]]++
                Replacement[parts[1]] = parts[2]
                for (k = 3; k <= n; k++)
                    Replacement[parts[1]] = Replacement[parts[1]] " " parts[k]
            }

            # Report an unreadable rule file instead of ignoring it.
            if (status < 0)
                print ("spell.awk: cannot read suffix file: " file) > "/dev/stderr"

            close(file)
        }
    }
    else
    {
        # Built-in rules: "-ed" -> "" or "e" (baked -> bak, bake),
        #                 "-ing" -> ""       (singing -> sing)
        Suffixes["ed$"] = 1
        Replacement["ed$"] = "\"\" e"
        Suffixes["ing$"] = 1
        Replacement["ing$"] = "\"\""
    }
}
# order_suffixes: copy the keys of Suffixes into OrderedSuffix[1..N]
# (N is stored in NOrderedSuffix) and arrange them from longest to
# shortest, so longer suffixes are tried before shorter ones.
#
# Locals: i, j (sort indices), key (current suffix).
function order_suffixes( i, j, key)
{
    NOrderedSuffix = 0
    for (key in Suffixes)
    {
        NOrderedSuffix++
        OrderedSuffix[NOrderedSuffix] = key
    }

    # Simple exchange sort: after pass i, slot i holds the longest of
    # the entries i..N.
    for (i = 1; i < NOrderedSuffix; i++)
    {
        j = i + 1
        while (j <= NOrderedSuffix)
        {
            if (length(OrderedSuffix[j]) > length(OrderedSuffix[i]))
                swap(OrderedSuffix, i, j)
            j++
        }
    }
}
# swap: exchange the elements a[i] and a[j] of array a in place.
# Local: temp (holds one value during the exchange).
function swap(a,i,j, temp)
{
    temp = a[j]
    a[j] = a[i]
    a[i] = temp
}
# spell_check_line: check every word of the current input record.
#
# All non-word characters in the record are replaced by spaces (which
# makes awk re-split it into fields); each field then has its leading
# and trailing apostrophes removed and, if anything is left, is handed
# to spell_check_word().
#
# Locals: k (field index), word (current candidate).
function spell_check_line( k, word)
{
    gsub(NonWordChars, " ")

    k = 1
    while (k <= NF)
    {
        word = $k
        sub("^'+", "", word)    # drop leading apostrophes
        sub("'+$", "", word)    # drop trailing apostrophes
        if (word != "")
            spell_check_word(word)
        k++
    }
}
# spell_check_word: record word in Exception unless it is known.
#
# A word is known when its lower-case form is in Dictionary or, with
# Strip enabled, when any candidate produced by strip_suffixes() is.
# Unknown words are accumulated in Exception[lower-case word], one
# occurrence per line; with Verbose set each occurrence is prefixed by
# "FILENAME:FNR:".
#
# Locals: key (unused), lc_word, location, w, wordlist.
function spell_check_word(word, key, lc_word, location, w, wordlist)
{
    lc_word = tolower(word)
    if (lc_word in Dictionary)
        return

    if (Strip)
    {
        # Accept the word if any suffix-stripped candidate is known.
        strip_suffixes(lc_word, wordlist)
        for (w in wordlist)
            if (w in Dictionary)
                return
    }

    # Still unknown: remember it, keeping the original spelling.
    if (Verbose)
        location = FILENAME ":" FNR ":"
    else
        location = ""

    if (!(lc_word in Exception))
        Exception[lc_word] = location word
    else
        Exception[lc_word] = Exception[lc_word] "\n" location word
}
# strip_suffixes: build in wordlist the candidate base words obtained by
# removing one suffix from word.
#
# The suffix patterns OrderedSuffix[1..NOrderedSuffix] are tried in order
# and only the first one that matches is applied.  The text from the
# start of the match (RSTART) onwards is cut off; then
#   - if Replacement[pattern] is empty, the bare stem is the only candidate;
#   - otherwise each space-separated replacement is appended to the stem,
#     with the token "" (two double quotes) standing for an empty ending.
# wordlist is emptied first, so it stays empty when nothing matches.
#
# Locals: ending (split replacements), k, n, regexp.
function strip_suffixes(word, wordlist, ending, k, n, regexp)
{
    split("", wordlist)         # portable way to clear an array

    for (k = 1; k <= NOrderedSuffix; k++)
    {
        regexp = OrderedSuffix[k]
        if (!match(word, regexp))
            continue

        # Keep only the part that precedes the matched suffix.
        word = substr(word, 1, RSTART - 1)

        if (Replacement[regexp] != "")
        {
            split(Replacement[regexp], ending)
            for (n in ending)
            {
                if (ending[n] == "\"\"")
                    ending[n] = ""
                wordlist[word ending[n]] = 1
            }
        }
        else
            wordlist[word] = 1

        # Only the first matching suffix is applied.
        return
    }
}
# report_exception: print every entry collected in the global Exception
# array, one print per misspelled (lower-cased) word.  An entry holds all
# recorded occurrences of that word separated by newlines.  The order of
# the entries is whatever order awk's "for (key in array)" yields.
#
# Local: key (declared as a parameter so it does not leak into the
#        global namespace).
function report_exception( key)
{
    for (key in Exception)
        print Exception[key]
}
An awk program is compiled into a compact internal representation and then interpreted at runtime by a virtual machine.
Its built-in functions, however, are implemented in the underlying language, which is currently C.
No comments:
Post a Comment