freckles

#!/bin/sh # # $Id: findaffix.X,v 1.23 2015-02-08 00:35:41-08 geoff Exp $ # # Copyright 1992, 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. All modifications to the source code must be clearly marked as # such. Binary redistributions based on modified source code # must be clearly marked as modified versions in the documentation # and/or other materials provided with the distribution. # 4. The code that causes the 'ispell -v' command to display a prominent # link to the official ispell Web site may not be removed. # 5. The name of Geoff Kuenning may not be used to endorse or promote # products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
#
# Find possible affixes for use with ispell
#
# Usage:
#
#   findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \
#     [-t tabchar] [files]
#
# Each common prefix (-p) or suffix (-s, default) is presented, along
# with statistics to indicate how useful such an affix might be in
# reducing the size of the input file.  Only those affixes which
# produce a legal root (one found in the original input) are reported.
#
# If the "-c" option is not given, the output lines are in the
# following format:
#
#   strip/add/count/bytes
#
# where "strip" is the string that should be stripped from a root
# word before adding the affix, "add" is the affix to be added, "count"
# is a count of the number of times that this "strip/add" combination
# appears, and "bytes" is an estimate of the number of bytes that
# will be saved in the raw dictionary file if this combination is
# added to the affix file.  The field separator in the output will
# normally be the tab character specified by the "-t" switch; the
# default is a slash ("/").
#
# If the "-c" ("clean output") option is given, the appearance of
# the output is made cleaner by changing it to:
#
#   -strip+add<tab>count<tab>bytes
#
# where "strip," "add," "count," and "bytes" are as before, and "<tab>"
# represents the ASCII tab character.
#
# The method used to generate possible affixes will also generate
# longer affixes which have common headers or trailers.  For example,
# the two words "moth" and "mother" will generate not only the obvious
# substitution "+er" but also "-h+her" and "-th+ther" (and possibly
# even longer ones, depending on the value of "min").  To prevent
# cluttering the output with such affixes, any affix pair that shares
# a common header (or, for prefixes, trailer) string longer than
# "elim" characters (default 1) will be suppressed.  You may want to
# set "elim" to a value greater than 1 if your language has string
# characters; usually the need for this parameter will become obvious
# when you examine the output of your findaffix run.
#
# Normally, the output is sorted on the "bytes" field.  If the "-f"
# flag is given, the output is sorted according to the "count" field.
#
# No affix longer than "max" characters (default 8) will be reported.
# Smaller values of "max" will make the script run faster.
#
# Affixes which appear fewer than "low" times (default 10) are
# suppressed.  This significantly reduces the size of the output file.
#
# Affixes which generate stems shorter than "min" characters (default 3)
# are suppressed.  (A stem is the word after the "strip" string has
# been removed, and before the "add" string has been added.)  This
# reduces both the running time and the size of the output file.  "Min"
# should only be set to 1 if you have a *lot* of free time and disk
# space.
#
# The script requires a non-blank field-separator character for internal
# use.  Normally, this character is a slash ("/"), but if the slash
# appears as a character in the input word list, a different character
# can be specified with the "-t" switch.
#
# If the input files are ispell dictionaries, they should be expanded
# before being fed to this script.
#
# If the input files contain characters other than [A-Za-z], they
# should be translated to lowercase before being fed to this script.
#
# $Log: findaffix.X,v $
# Revision 1.23 2015-02-08 00:35:41-08 geoff
# Be a bit more paranoid about creating temporary files.
#
# Revision 1.22 2005/04/27 01:18:34 geoff
# Work around idiotic POSIX incompatibilities in sort.  Add secure
# temp-file handling.
#
# Revision 1.21 2005/04/14 14:39:33 geoff
# Use /tmp as the default temp directory
#
# Revision 1.20 2005/04/14 14:38:23 geoff
# Update license.  Protect against modernized (i.e., incompatible) and
# internationalized sort commands.
#
# Revision 1.19 2001/09/06 00:30:28 geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.18 2001/07/25 21:51:46 geoff
# Minor license update.
#
# Revision 1.17 2001/07/23 20:24:03 geoff
# Update the copyright and the license.
#
# Revision 1.16 1999/01/07 01:22:55 geoff
# Update the copyright.
#
# Revision 1.15 1994/01/25 07:11:29 geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#
# MODIFIED VERSION -- local changes relative to revision 1.23:
#   * The temporary directory is removed on a usage error, and the
#     signal traps are installed as soon as the directory exists.
#   * Options that take a value (-e -m -M -l -t) now produce the usage
#     message when the value is missing.
#   * LC_ALL is exported along with the other locale variables.
#   * With -p, the last "elim" characters of each affix are compared
#     (previously only the last character, whatever "elim" was).
#   * The -c output separator is generated with printf so that it is a
#     real tab character.
#   * The final pipeline stage no longer uses "exec".
#   * Temporary-file paths are quoted.
#
# In one of the most incredibly stupid decisions of all time, some
# genius decided to break backwards compatibility by "deprecating" the
# old-style sort switches even though it was trivial to recognize both
# styles.  The result is that thousands of people (like me) will
# have to rewrite shell scripts to tolerate that stupidity.  (It's not
# that the new syntax is bad--it's definitely easier to understand.
# But that doesn't excuse breaking compatibility.)
#
CRETIN_SORT=true
#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above.  It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME

TDIR=${TMPDIR-/tmp}
TEMPDIR=`mktemp -d "${TDIR}/faffXXXXXXXXXX" 2>/dev/null` \
  || { echo "$0: Failed to create temporary directory; exiting..." 1>&2;
       exit 1; }
#
# Install the cleanup traps immediately, so that a signal arriving at any
# point after the directory exists still removes it.  Signal 13 (SIGPIPE)
# exits with status 0; signals 1, 2 and 15 exit with status 1.
#
trap 'rm -rf "$TEMPDIR"; exit 1' 1 2 15
trap 'rm -rf "$TEMPDIR"; exit 0' 13
TMP=${TEMPDIR}/faff.
# SORTTMP is expanded unquoted below on purpose: it must split into the
# two words "-T" and the directory name.
SORTTMP="-T ${TDIR}"			# !!SORTTMP!!

USAGE='Usage:  findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'

#
# Print the usage message on stderr, remove the temporary directory, and
# exit with status 1.
#
usage()
{
    echo "$USAGE" 1>&2
    rm -rf "$TEMPDIR"
    exit 1
}

#
# LOOP is the awk fragment that splits each word into every candidate
# stem/affix pair; ELIM is the awk program that discards uninteresting
# strip/add pairs.  The defaults below are for suffixes; the -p switch
# replaces both with the prefix versions.
#
LOOP='
        i = len - maxlim + 1
        if (i < minstem + 1)
            i = minstem + 1
        for (  ;  i <= len;  i++)
            print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
        print $0 tabch tabch len'
ELIM='$1 != $2 {
        if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
            print
    }'
maxlim=8
minstem=3
elimlen=1
lowcount=10
cleanout=no
if $CRETIN_SORT
then
    finalsortopts='-k 4rn,4 -k 3rn,3 -k 2,2 -k 1,1'
else
    finalsortopts='+3rn -4 +2rn -3 +1 -2 +0 -1'
fi
tabch=/

while :
do
    case "$1" in
        -p)
            LOOP='
        lim = len - minstem
        if (lim > maxlim)
            lim = maxlim
        for (i = 1;  i <= lim;  i++)
            print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
        print $0 tabch tabch len'
            # Compare the trailing elimlen characters of the two affixes.
            # For elimlen=1 the start position is length(), as before.
            ELIM='$1 != $2 {
        if (substr ($1, length ($1) - elimlen + 1, elimlen) != substr ($2, length ($2) - elimlen + 1, elimlen))
            print
    }'
            shift
            ;;
        -s)
            shift
            ;;
        -f)
            if $CRETIN_SORT
            then
                finalsortopts='-k 3rn,3 -k 4rn,4 -k 2,2 -k 1,1'
            else
                finalsortopts='+2rn -3 +3rn -4 +1 -2 +0 -1'
            fi
            shift
            ;;
        -c)
            cleanout=yes
            shift
            ;;
        -e)
            [ $# -ge 2 ] || usage
            elimlen=$2
            shift; shift
            ;;
        -m)
            [ $# -ge 2 ] || usage
            minstem=$2
            shift; shift
            ;;
        -M)
            [ $# -ge 2 ] || usage
            maxlim=$2
            shift; shift
            ;;
        -l)
            [ $# -ge 2 ] || usage
            lowcount=$2
            shift; shift
            ;;
        -t)
            [ $# -ge 2 ] || usage
            tabch="$2"
            shift; shift
            ;;
        -*)
            usage
            ;;
        *)
            break
            ;;
    esac
done
#
# We are ready to do the work.  First, we collect all input, translate it
# to lowercase, sort it (dropping duplications), and save it for later.
#
if [ $# -ne 0 ]
then
    cat "$@" | tr '[A-Z]' '[a-z]'
else
    tr '[A-Z]' '[a-z]'
fi \
  | sort -u $SORTTMP > "${TMP}a"
#
# Now the monstrous pipeline.  The awk command produces several lines for
# each input word.  Each line contains a possible stem (first field),
# a possible affix, and the length of the original word.  The loop which
# does this was placed into the LOOP variable by the code above (q.v.).
#
# The first sort puts this output into an order appropriate for feeding
# to 'join'.  The join command then combines stems and affixes, and for
# each puts out an affix to strip, an affix to add, and the length of
# the word before and after modification.
#
# From here on out the job is relatively easy.  The second 'awk' gets rid
# of lines that have the same strip and add affixes, and also eliminates
# lines where the strip and add affix have a common leading (for suffixes)
# or trailing (for prefixes) substring, or where the strip affix is longer
# than the add affix (this is all done by the $ELIM variable, which is also
# set up by the code above).  The second sort collects identical affixes;
# the third 'awk' functions like 'uniq -c', replacing duplicate affixes
# with a count and summing the estimate of bytes saved.  It also eliminates
# any affixes which appear less frequently than the minimum ("lowcount").
# Finally, the third sort ($finalsortopts) rearranges the list in the chosen
# sort order.
#
if $CRETIN_SORT
then
    sortopts1='-k 1,1 -k 2'
    sortopts2='-k 2,2 -k 1,1'
else
    sortopts1='+0 -1 +1'
    sortopts2='+1 -2 +0 -1'
fi
#
# NOTE: the first sort writes (-o) to the same file that awk is reading.
# This relies on sort not truncating its output file until it has consumed
# all of its input.
#
awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"}
    {
        len = length ($0)
        if (len < 2)
            next
        '"$LOOP"'
    }' < "${TMP}a" \
  | sort "-t$tabch" $sortopts1 $SORTTMP -o "${TMP}a"
join "-t$tabch" -o 1.2 2.2 2.3 "${TMP}a" "${TMP}a" \
  | awk "-F$tabch" "BEGIN{elimlen=$elimlen}$ELIM" \
  | sort "-t$tabch" $sortopts2 $SORTTMP \
  | awk "-F$tabch" 'BEGIN{tabch="'"$tabch"'"; lowcount='"$lowcount"'}
    {
        if ($1 == last1  &&  $2 == last2) {
            count++
            totchars += $3
        } else {
            if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount)
                print last1 tabch last2 tabch count tabch totchars
            count = 1
            last1 = $1
            last2 = $2
            totchars = $3
        }
    }
    END {
        if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount)
            print last1 tabch last2 tabch count tabch totchars
    }' \
  | sort "-t$tabch" $finalsortopts $SORTTMP \
  | if [ "$cleanout" = "yes" ]
    then
        # Pick a sed delimiter that cannot collide with the field
        # separator, and escape the separator if it is special in a
        # basic regular expression.
        case "$tabch" in
            /)
                sedsub=/
                sedsep=';'
                ;;
            .|\*|\[|\^|\$|\\)
                sedsub="\\$tabch"
                sedsep=/
                ;;
            *)
                sedsub="$tabch"
                sedsep=/
                ;;
        esac
        # Build the tab with printf rather than embedding a literal one,
        # so that whitespace damage to this file cannot alter the output.
        TAB=`printf '\t'`
        # Turn "strip<sep>add<sep>count<sep>bytes" into
        # "-strip+add<tab>count<tab>bytes", dropping an empty "-" or "+"
        # part.  No "exec" here: if the shell runs this stage in the
        # current process, exec would skip the cleanup below.
        sed -e "s${sedsep}${sedsub}${sedsep}${TAB}${sedsep}g" \
          -e "s/${TAB}/+/" -e 's/^/-/' \
          -e 's/^-+/+/' -e "s/+${TAB}/${TAB}/"
    else
        cat
    fi
rm -rf "$TEMPDIR"

freckles

Latest stories

What This Little Girl Can Do Is Beyond Your Imagination

Latest stories

Log In

Sign In

Forgot password?

Your password reset link appears to be invalid or expired.

Log in

Privacy Policy

Add to Collection

No Collections

Hey Friend!Before You Go…

Hey Friend!
Before You Go…