; Have a look at my implementation of Soundex in Scheme! You need ; the Spark Scheme interpreter to run this, as we use some Spark extensions. ; It can be run on other Schemes as well, by making minor modifications. ; This is how the program works: It reads a file of English words and builds ; a Soundex database. This is a one-time process. Then it enters a prompt where ; the user can type-in a word. If that word is not in the dictionary, a list of ; possible suggestions are given back to the user. Here is a sample session: ; > property ; > propritory ; (proprietary proprietor) ; There are three variables to control the behavior of the spell checker: ; 1. dict-file. The complete or relative path of the 'words' file. ; This should contain a list of words separated by '\r\n'. ; 2. match-range. This decides how close the suggestions should be to ; the typo. This defaults to "0.5". A lesser value will give more ; words to choose from and a higher value will give only those ; suggestions that closely match the wrong word. For instance, if you ; change this value to .3, the above given typo 'propritory' ; will give you the suggestions: '(proprietary proprietor preparatory)'. ; 3. show-all-suggestions. If this is true (#t) all possible ; suggestions are printed. match-range is ignored. ; Now, here is the complete source of our spell checker: ;; You can customize the program by changing the values of these three variables: (define dict-file "words") (define match-range .5) (define show-all-suggestions #f) ;; :~ ;; The table used to create soundex constants. (define consonents null) (define (init-consonents) (set! consonents (make-hash-table)) (let* ((c #(#\b #\f #\p #\v #\c #\g #\j #\k #\q #\s #\x #\z #\d #\t #\l #\m #\n #\r)) (d #(1 1 1 1 2 2 2 2 2 2 2 2 3 3 4 5 5 6)) (c-len (vector-length c)) (i 0)) (while (< i c-len) (hash-table-put! consonents (vector-ref c i) (vector-ref d i)) (set! i (add1 i))))) ;; Returns the soundex code of a word. ;; The optional argument decides whether to retrun ;; the full code or truncate it to 3 digits. (define (soundex word . args) (if (null? consonents) (init-consonents)) (set! word (string-downcase word)) (let ((len (string-length word)) (sdx (list)) (i 1) (prev-digit 0) (curr-digit 0) (cut #t)) (if (not (null? args)) (set! cut (car args))) (set! sdx (append sdx (list (string-ref word 0)))) (while (< i len) (set! curr-digit (hash-table-get consonents (string-ref word i) null)) (if (and (not (null? curr-digit)) (not (= curr-digit prev-digit))) (begin (set! sdx (append sdx (list curr-digit))) (set! prev-digit curr-digit))) (set! i (add1 i)) (if cut (begin (if (not (< (length sdx) 4)) (set! i len))))) (while (< (length sdx) 4) (set! sdx (append sdx (list 0)))) sdx)) ;; Reads the words file to memory. ;; Note: This implementation is not very efficient. (define (load-words) (let ((words (list)) (flen (file-size dict-file))) (call-with-input-file dict-file (lambda (f) (let ((line (read-line f 'return-linefeed))) (while (not (eof-object? line)) (if (symbol? line) (set! line (symbol->string line))) (if (string? line) (set! words (append words (list line)))) (set! line (read-line f 'return-linefeed)))))) words)) ;; Groups words to their soundex code using an association list. (define soundex-dict (list)) ;; The soundex database filename. It is created in the ;; current path by appending the extension ".sdx" to the ;; words file name. (define (get-soundex-file-name) (string-append (path->string (file-name-from-path dict-file)) ".sdx")) ;; Saves the soundex codes hash-map to the .sdx file. ;; Next time the program starts up fast as this ;; file is directly read into memory. If this file ;; exists, the the "words" file is ignored. (define (save-soundex-dict) (let ((file-name (get-soundex-file-name))) (call-with-output-file file-name (lambda (f) (write soundex-dict f))))) ;; Load the .sdx file. (define (load-soundex-dict) (let ((file-name (get-soundex-file-name))) (if (file-exists? file-name) (begin (call-with-input-file file-name (lambda (f) (set! soundex-dict (read f)))) #t) #f))) ;; Creates the soundex association list. (define (make-soundex-dict) (if (not (load-soundex-dict)) (begin (printf "Creating word patterns. This one time process might take a few minutes.~n") (flush-output) (let ((words (load-words)) (sdx null) (wlist null) (w null)) (while (not (null? words)) (set! w (car words)) (set! sdx (soundex w #f)) (set! wlist (assoc sdx soundex-dict)) (if (eq? wlist #f) (set! wlist (list w)) (begin (set! soundex-dict (remove soundex-dict (assoc sdx soundex-dict))) (set! wlist (cdr wlist)) (set! wlist (append wlist (list w))))) (set! soundex-dict (append soundex-dict (list (cons sdx wlist)))) (set! words (cdr words)))) (save-soundex-dict)))) ;; Returns the number of common characters ;; in the strings w1 and w2. (define (count-shared-chars w1 w2) (let* ((w1-len (string-length w1)) (w2-len (string-length w2)) (min-len (min w1-len w2-len)) (w null) (c 0) (wlst null) (count 0) (found (list))) (if (= w1-len min-len) (begin (set! w w1) (set! wlst (string->list w2))) (begin (set! w w2) (set! wlst (string->list w1)))) (for i in (range 0 min-len) (set! c (string-ref w i)) (if (= (find found c) -1) (begin (if (>= (find wlst c) 0) (begin (set! count (add1 count)) (set! found (append found (list c)))))))) count)) ;; Returns a list of words that are phonetically most ;; similar to "word". We use a simple distance calculation ;; algorithm to find these. (define (find-most-similar-words word words) (let ((a (string-length word)) (ret (list))) (while (not (null? words)) (let* ((c (count-shared-chars (car words) word)) (w (car words)) (b (string-length w)) (qs (/ (* 2 c) (+ a b)))) (if (>= qs match-range) (set! ret (append ret (list (cons w qs))))) (set! words (cdr words)))) (set! ret (sort ret (lambda (a b) (> (cdr a) (cdr b))))) ret)) (define (remove-qs lst) (let ((ret (list))) (while (not (null? lst)) (set! ret (append ret (list (car (car lst))))) (set! lst (cdr lst))) ret)) ;; Main program starts here. Create the soundex database if it is not there: (make-soundex-dict) (printf "Enter words to spellcheck. Terminate the session by typing QUIT.~n") (define word null) ;; Read words from the user, give spelling suggestions if he inputs a wrong word. ;; This loop can be terminated by typing "QUIT". (while #t (printf "> ") (flush-output) (set! word (read)) (if (not (eq? word 'QUIT)) (begin (let ((s (assoc (soundex (symbol->string word) #f) soundex-dict))) (if (eq? s #f) (printf "NO SUGGESTIONS~n") (begin (let ((words (cdr s))) (if (not (find-if words (lambda (i) (string=? i (symbol->string word))))) (begin (if show-all-suggestions (printf "~a~n" words) (printf "~a~n" (remove-qs (find-most-similar-words (symbol->string word) words))))))))))) (break)))