Sunday, July 20, 2014

awk: spell checker example

 #Usage:  
 #  awk [-v Dictionaries="sysdict1 sysdict2 ..."] -f spell.awk -- \  
 #    [=suffixfile1 =suffixfile2 ...] [+dict1 +dict2 ...] \  
 #    [-strip] [-verbose] [file(s)]  
   
 # One-time setup before any input is read: options, dictionaries, suffixes.
 BEGIN {
   initialize()
 }

 # Pattern-less rule: runs once for every input record.
 {
   spell_check_line()
 }

 # After the last record, print everything collected as an exception.
 END {
   report_exception()
 }
   
 # initialize: set the global word-separator pattern and run the setup
 # steps in the order they depend on each other.
 function initialize()
 {
   # Any character that is neither an ASCII letter nor an apostrophe is
   # treated as a word separator by spell_check_line().
   NonWordChars = "[^'A-Za-z]"

   # scan_options() may add "+dict" files to DictionaryFiles, so it must
   # run before load_dictionaries(); suffixes must be loaded before they
   # can be ordered.
   get_dictionaries()
   scan_options()
   load_dictionaries()
   load_suffixes()
   order_suffixes()
 }
   
 # get_dictionaries: fill the global set DictionaryFiles (indexed by file
 # name) with the dictionaries to load.
 #
 # Source of the list, in priority order:
 #   1. the awk variable Dictionaries (e.g. set with -v Dictionaries="a b"),
 #   2. the environment variable DICTIONARIES,
 #   3. the built-in default system dictionaries.
 #
 # Locals: key (loop index), files (array produced by split).
 function get_dictionaries(    key, files)
 {
   # ENVIRON is awk's associative array of inherited environment
   # variables, indexed by variable name.
   if ((Dictionaries == "") && ("DICTIONARIES" in ENVIRON))
     Dictionaries = ENVIRON["DICTIONARIES"]

   if (Dictionaries == "")
   {
     # Nothing was supplied: fall back to the system word lists. Without
     # this fallback no dictionary would be loaded and every word would be
     # reported as an exception. A file that does not exist is harmless,
     # because load_dictionaries() only reads while getline succeeds.
     DictionaryFiles["/usr/share/dict/american-english"]++
     DictionaryFiles["/usr/share/dict/british-english"]++
   }
   else
   {
     # split() with no separator splits on whitespace and fills files[]
     # with indices starting at 1.
     split(Dictionaries, files)
     for (key in files)
       DictionaryFiles[files[key]]++
   }
 }
   
 # scan_options: consume the program's own command-line options from ARGV.
 #
 #   -strip     sets the global flag Strip
 #   -verbose   sets the global flag Verbose
 #   =FILE      registers FILE in SuffixFiles and counts it in NSuffixFiles
 #   +FILE      registers FILE in DictionaryFiles
 #
 # Anything else is left in ARGV so awk treats it as an input file.
 # Locals: k (argument index), arg (current argument).
 function scan_options(   k, arg)
 {
   # ARGV[0] is the command name, so real arguments start at index 1.
   for (k = 1; k < ARGC; k++)
   {
     arg = ARGV[k]

     if (arg == "-strip")
       Strip = 1
     else if (arg == "-verbose")
       Verbose = 1
     else if (arg ~ /^=/)
     {
       NSuffixFiles++
       SuffixFiles[substr(arg, 2)]++
     }
     else if (arg ~ /^[+]/)
       DictionaryFiles[substr(arg, 2)]++
     else
       continue    # not one of our options: keep it as an input file

     # A recognized option must be blanked out, otherwise awk would try
     # to open it as an input file and complain that it does not exist.
     ARGV[k] = ""
   }

   # Drop trailing empty arguments (needed for nawk): if empty arguments
   # remain at the end, nawk will not fall back to reading standard input.
   while ((ARGC > 0) && (ARGV[ARGC - 1] == ""))
     ARGC--
 }
   
 # load_dictionaries: read every file named in DictionaryFiles, one word
 # per line, and record each word in lowercase in the global associative
 # array Dictionary.
 #
 # Locals: file (current dictionary file name), word (current line).
 # They are declared as extra parameters so that they do not leak into
 # the global namespace.
 function load_dictionaries(    file, word)
 {
   for (file in DictionaryFiles)
   {
     # getline returns 1 per line read, 0 at end of file and -1 on error
     # (e.g. a missing file), so an unreadable dictionary is skipped.
     while ((getline word < file) > 0)
       Dictionary[tolower(word)]++
     close(file)
   }
 }
   
 # load_suffixes: fill the global arrays Suffixes (set of suffix patterns)
 # and Replacement (suffix -> space-separated list of replacement endings).
 #
 # With no suffix files given on the command line, a small built-in rule
 # set is used. Otherwise each non-empty, non-comment line of every suffix
 # file is a rule: the first field is the suffix, the remaining fields are
 # its replacements.
 #
 # Locals: file, k, line, n, parts, suffix, repl.
 function load_suffixes(   file, k, line, n, parts, suffix, repl)
 {
   if (!(NSuffixFiles > 0))
   {
     # Built-in defaults. The replacement token "" (two double quotes)
     # stands for "append nothing".
     Replacement["ed"] = "\"\" e"
     Replacement["ing"] = "\"\""
     Suffixes["ed"] = 1
     Suffixes["ing"] = 1
     return
   }

   for (file in SuffixFiles)
   {
     while ((getline line < file) > 0)
     {
       sub(" *#.*$", "", line)     # drop a trailing comment
       sub("^[ \t]+", "", line)    # drop leading whitespace
       sub("[ \t]+$", "", line)    # drop trailing whitespace

       if (line != "")
       {
         n = split(line, parts)
         suffix = parts[1]
         Suffixes[suffix]++

         # Re-join fields 2..n with single spaces as the replacement list.
         repl = parts[2]
         for (k = 3; k <= n; k++)
           repl = repl " " parts[k]
         Replacement[suffix] = repl
       }
     }
     close(file)
   }
 }
   
 # order_suffixes: copy the keys of Suffixes into the global array
 # OrderedSuffix[1..NOrderedSuffix] and sort them by decreasing length,
 # so that longer suffixes are tried before shorter ones.
 #
 # Locals: i, j (sort indices), key (suffix being copied).
 function order_suffixes(   i, j, key)
 {
   NOrderedSuffix = 0
   for (key in Suffixes)
   {
     NOrderedSuffix++
     OrderedSuffix[NOrderedSuffix] = key
   }

   # Simple exchange sort: after pass i, slot i holds the longest of the
   # suffixes in slots i..NOrderedSuffix.
   i = 1
   while (i < NOrderedSuffix)
   {
     j = i + 1
     while (j <= NOrderedSuffix)
     {
       if (length(OrderedSuffix[j]) > length(OrderedSuffix[i]))
         swap(OrderedSuffix, i, j)
       j++
     }
     i++
   }
 }
   
 # swap: exchange elements i and j of array a in place.
 # Local: temp (holds one element during the exchange).
 function swap(a,i,j,  temp)
 {
   temp = a[j]
   a[j] = a[i]
   a[i] = temp
 }
   
 # spell_check_line: split the current input record into words and check
 # each one.
 #
 # Locals: k (field index), word (current candidate word).
 function spell_check_line(   k, word)
 {
   # Turn every non-word character into a blank. Modifying $0 makes awk
   # re-split the record, so afterwards each field is one candidate word.
   gsub(NonWordChars, " ", $0)

   k = 1
   while (k <= NF)
   {
     word = $k
     sub("^'+", "", word)    # strip leading apostrophes
     sub("'+$", "", word)    # strip trailing apostrophes

     # A field made only of apostrophes is empty by now and is skipped.
     if (word != "")
       spell_check_word(word)
     k++
   }
 }
   
 # spell_check_word: look word up in Dictionary (case-insensitively) and,
 # if it is not found, record it in the global array Exception.
 #
 # When the global flag Strip is set, the candidate base forms produced by
 # strip_suffixes() are also looked up before the word is given up on.
 # When Verbose is set, each recorded occurrence is prefixed with
 # "FILENAME:FNR:".
 #
 # Locals: key, lc_word, location, w, wordlist, entry.
 function spell_check_word(word,   key, lc_word, location, w, wordlist, entry)
 {
   lc_word = tolower(word)

   # Known word: nothing to report.
   if (lc_word in Dictionary)
     return

   # Try the suffix-stripped variants; any hit means the word is accepted.
   if (Strip)
   {
     strip_suffixes(lc_word, wordlist)
     for (w in wordlist)
       if (w in Dictionary)
         return
   }

   # Still unknown: record this occurrence with its original spelling.
   if (Verbose)
     location = FILENAME ":" FNR ":"
   else
     location = ""

   # Occurrences of the same lowercase word are kept together, one per
   # line, under a single Exception entry.
   entry = location word
   if (lc_word in Exception)
     entry = Exception[lc_word] "\n" entry
   Exception[lc_word] = entry
 }
   
 # strip_suffixes: compute the candidate base forms of word.
 #
 # Parameters:
 #   word      lowercase word to reduce (modified only locally)
 #   wordlist  output array; cleared, then indexed by each candidate form
 #
 # The suffixes in OrderedSuffix[1..NOrderedSuffix] are tried in order
 # (longest first) and only the first one that matches is applied. The
 # matched suffix is removed and each ending listed in Replacement for
 # that suffix is appended in turn; the token "" (two double quotes)
 # means "append nothing". With no replacement, the bare stem is the only
 # candidate.
 #
 # Locals: ending, k, n, regexp (suffix as stored, used as the Replacement
 # key), pattern (regexp anchored at the end of the word).
 function strip_suffixes(word, wordlist,    ending, k, n, regexp, pattern)
 {
   # split("", array) is the portable way to empty an array.
   split("", wordlist)

   for (k = 1; k <= NOrderedSuffix; k++)
   {
     regexp = OrderedSuffix[k]

     # A suffix must match at the END of the word. Unanchored, "ed" would
     # match the first "ed" in "needed" and cut the word down to "ne".
     # Patterns that already end in "$" are used unchanged.
     pattern = regexp
     if (pattern !~ /[$]$/)
       pattern = "(" pattern ")$"

     if (match(word, pattern))
     {
       # RSTART is where the match begins, so this keeps the stem only.
       word = substr(word, 1, RSTART - 1)

       if (Replacement[regexp] == "")
         wordlist[word] = 1
       else
       {
         split(Replacement[regexp], ending)
         for (n in ending)
         {
           if (ending[n] == "\"\"")
             ending[n] = ""
           wordlist[word ending[n]] = 1
         }
       }

       break
     }
   }
 }
   
 # report_exception: print every entry collected in the global array
 # Exception, one entry per print statement (an entry may itself span
 # several lines). The output order is whatever order awk's "for (key in
 # array)" happens to produce.
 #
 # Local: key (declared as an extra parameter so the loop variable does
 # not leak into the global namespace).
 function report_exception(    key)
 {
   for (key in Exception)
     print Exception[key]
 }
   

An awk program is compiled into a compact internal representation and then interpreted at runtime by a virtual machine.
Its built-in functions, however, are implemented in the underlying language, which is currently C.

No comments:

Post a Comment