!!POUNDBANG!!
#
# $Id: subset.X,v 1.22 2005/04/27 01:18:35 geoff Exp $
#
# Copyright 1992, 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
#    such.  Binary redistributions based on modified source code
#    must be clearly marked as modified versions in the documentation
#    and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
#    link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
#    products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
#	Combine and resolve various dictionaries so they are proper
#	subsets of one another, and so that maximal use is made of
#	flags in the smaller ones.
#
#	Usage:
#
#	subset [-b base] [-l langfile] small-dict bigger-dict ... biggest-dict
#
#	The output is an equal number of successively-larger
#	dictionaries.  The smallest is written to "dict.0".  Successive
#	files are named "dict.1", "dict.2", and so forth, and each contains
#	a list of words which should be added to the previous files to
#	generate a dictionary.  Words which are in smaller dictionaries are
#	effectively propagated to the larger ones, so that the smaller ones
#	are proper subsets of their siblings.  If dictionaries are
#	completely disjoint, this may result in an empty output dictionary.
#	Affix flags are propagated to the smallest dictionary containing
#	the root word; this expands the effectiveness of small dictionaries
#	at no cost in hash table space.
#
#	The -b switch is used to specify a different base name for the
#	output files than "dict".  (In other words, "-b english" would
#	produce output in english.0, english.1, etc.).
#
#	If the -l switch is specified, the language tables are gotten
#	from the specified file; otherwise they come from $LIBDIR/!!DEFLANG!!.
#
#	Input dictionaries should be "clean"; if non-word characters
#	appear in the dictionaries, the script may produce incorrect output.
#
# $Log: subset.X,v $
# Revision 1.22  2005/04/27 01:18:35  geoff
# Work around idiotic POSIX incompatibilities in sort.  Add secure
# temp-file handling.
#
# Revision 1.21  2005/04/14 14:39:33  geoff
# Use /tmp as the default temp directory
#
# Revision 1.20  2005/04/14 14:38:23  geoff
# Update license.  Protect against modernized (i.e., incompatible) and
# internationalized sort commands.  Fix a place where the affix table
# wasn't passed to munchlist.
#
# Revision 1.19  2001/09/06 00:30:29  geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.18  2001/07/25 21:51:46  geoff
# Minor license update.
#
# Revision 1.17  2001/07/23 20:24:04  geoff
# Update the copyright and the license.
#
# Revision 1.16  1999/01/07 01:57:42  geoff
# Update the copyright.
#
# Revision 1.15  1995/01/08 23:23:47  geoff
# Support variable hashfile suffixes for DOS purposes.
#
# Revision 1.14  1994/01/25 07:12:10  geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#

#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
# LC_ALL is exported explicitly: it overrides every other LC_* setting,
# so leaving it out of the export list could let an inherited value win.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above.  It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME

LIBDIR=!!LIBDIR!!

USAGE="Usage: subset [-b base] [-l langfile] dict-0 dict-1 ..."

#
# Parse the command line.  This is done before any temporary files or
# directories are created, so that a usage error leaves nothing behind.
#
langtabs=${LIBDIR}/!!DEFLANG!!
outbase=dict
while :
do
    case "$1" in
	-b)
	    # The switch needs a value; don't rely on a failing "shift".
	    if [ $# -lt 2 ]
	    then
		echo "$USAGE" 1>&2
		exit 1
	    fi
	    outbase="$2"
	    shift; shift
	    ;;
	-l)
	    if [ $# -lt 2 ]
	    then
		echo "$USAGE" 1>&2
		exit 1
	    fi
	    langtabs="$2"
	    shift; shift
	    ;;
	-*)
	    echo "$USAGE" 1>&2
	    exit 1
	    ;;
	*)
	    break
	    ;;
    esac
done

# At least two dictionaries are required.
if [ $# -lt 2 ]
then
    echo "$USAGE" 1>&2
    exit 1
fi

#
# Set up temporary storage.  A private directory from mktemp is
# preferred; if mktemp is unavailable or fails, fall back to
# PID-named files placed directly in the temp directory.
#
TDIR=${TMPDIR-/tmp}
TEMPDIR=`mktemp -d "${TDIR}/ssetXXXXXXXXXX" 2>/dev/null` || TEMPDIR="$TDIR"
TMP=${TEMPDIR}/sset$$
# NOTE: $SORTTMP is deliberately expanded unquoted below so that it
# splits into an option and its argument (or into nothing, if the build
# substitutes an empty value).  A TMPDIR containing whitespace is
# therefore still not supported by the sort invocations.
SORTTMP="-T ${TDIR}"			# !!SORTTMP!!

#
# Remove everything this script created in the temp area: the private
# directory if we have one, otherwise every file sharing our prefix.
# Paths are quoted so that odd characters in TMPDIR can't make rm
# operate on unrelated names.
#
cleanup()
{
    if [ "$TEMPDIR" = "$TDIR" ]
    then
	rm -rf "${TMP}"*
    else
	rm -rf "$TEMPDIR"
    fi
}

# Temp files
MUNCHOUTPUT=${TMP}a
MISSINGWORDS=${TMP}b
TEMPDICT=${TMP}c
FAKEDICT=${TMP}d
FAKEHASH=${TMP}e!!HASHSUFFIX!!

# Hangup, interrupt, and terminate are failures; a broken pipe
# (signal 13) is treated as a normal exit.
trap 'cleanup; exit 1' 1 2 15
trap 'cleanup; exit 0' 13
#
# Create a dummy dictionary to hold a compiled copy of the language
# tables.
#
echo 'QQQQQQQQ' > "$FAKEDICT"
buildhash -s "$FAKEDICT" "$langtabs" "$FAKEHASH" \
  || { echo "Couldn't create fake hash file" 1>&2; cleanup; exit 1; }
rm -f "${FAKEDICT}"*
#
# Figure out what the flag-marking character is.
#
flagmarker=`ispell -D -d "$FAKEHASH" \
  | sed -n -e '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*)
	# The marker was reported with a leading backslash; keep only
	# the character that follows it.
	flagmarker=`expr "$flagmarker" : '.\(.\)'`
	;;
esac
#
# (1) Use munchlist to create a list of roots and maximal suffixes.
#
munchlist -l "$langtabs" "$@" | sort $SORTTMP > "$MUNCHOUTPUT"
#
# (2) Use join to add the maximal suffixes to each dictionary's roots.
#     Re-expand this, combine with the original, and save for later.
#
# A literal newline, used to turn space-separated words into one word
# per line for sort and join.
newline='
'
dictno=0
for dictfile
do
    # join -a1 also keeps lines that have no match in $MUNCHOUTPUT.
    ispell -e -d "$FAKEHASH" < "$dictfile" | tr ' ' "$newline" \
      | sort -u $SORTTMP | join "-t$flagmarker" -a1 - "$MUNCHOUTPUT" \
      | ispell -e -d "$FAKEHASH" | tr ' ' "$newline" \
      | sort -u $SORTTMP > "${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
rm -f "$MUNCHOUTPUT"
#
# (3) For each adjacent pair of dictionaries, use comm to find words
#     in the smaller that are missing from the larger, and add them
#     to the larger.
#
# Drop the first argument so the loops below run once per dictionary
# after the smallest; the loop variable is only used for counting.
shift
lastdict="${TEMPDICT}.0"
dictno=1
for dictfile
do
    # comm -23: lines found only in the first (smaller) file.
    comm -23 "$lastdict" "${TEMPDICT}.$dictno" > "$MISSINGWORDS.$dictno"
    if [ -s "$MISSINGWORDS.$dictno" ]
    then
	sort $SORTTMP -o "${TEMPDICT}.$dictno" \
	  "${TEMPDICT}.$dictno" "$MISSINGWORDS.$dictno"
    fi
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
rm -f "$MISSINGWORDS".*
#
# (4) For each pair of dictionaries, use comm to eliminate words in
#     the smaller from the larger, and shrink the result with munchlist.
#     From this point out, we ignore interrupts.
#
munchlist -l "$langtabs" "${TEMPDICT}.0" > "$outbase.0"
lastdict="${TEMPDICT}.0"
dictno=1
trap "" 1 2 13 15
for dictfile
do
    # comm -13: lines found only in the second (larger) file.
    comm -13 "$lastdict" "${TEMPDICT}.$dictno" \
      | munchlist -l "$langtabs" > "$outbase.$dictno"
    rm -f "$lastdict"
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
cleanup