#!/bin/csh -f

# Copies each file or standard input to standard output with punctuation removed as below:

# Words are broken at * (due to detex behavior), -, _, /, ~, blank, and tab.

# Then any single capital letter followed by a period is removed.
# Also any lone ampersand is removed.

# Then terminal punctuation
# (" or '  or ) or ] or } or . or ! or ? or , or ; or :)
# is removed repeatedly.

# Then initial punctuation
# (" or `  or ( or [ or {)
# is removed repeatedly.

# Finally, numbers and numeric words (numbers prepended by ', #, or $,
# numbers with % appended, or ordinals) are also removed.  (Note
# that +, ., (, and ) may appear in numbers due to ordinary number and
# phone number syntaxes; - is already a word break.)  Numbers
# including a colon that could represent a time on a 12 or 24 hour
# clock are also removed.

# The rest are kept even though they may be strange "words", e.g.,
# "0pen", since the emphasis here is on finding words for spell
# checking.

# Filter the named files (or standard input when no files are given).
# Every pattern below is anchored to the whole line, so each input line
# is treated as one candidate word.
# NOTE(review): no stage here splits text into words; presumably the
# caller supplies one word per line -- confirm.
#
# Stages, in order:
#   1. drop lines that are a single capital letter plus period, or a lone &
#   2. strip any run of trailing punctuation
#   3. strip any run of leading punctuation
#   4. drop numbers with an optional leading quote, hash or dollar sign
#      (this also drops lines left empty by the two stripping stages)
#   5. drop numbers with an optional trailing percent sign
#   6. drop clock times such as 9:05, 23:59 or 12:00:30
#   7. drop well-formed ordinals such as 1st, 22nd, 113th; malformed ones
#      such as 11st, 12nd or 13rd are kept so the spell checker sees them
#      (a digit prefix before 1st/2nd/3rd must end in a digit other than 1)
#
# grep -E is used instead of egrep, which current GNU grep reports as
# obsolescent.
cat $* \
  | grep -E -v '^([A-Z]\.|&)$' \
  | sed s+\[\]\"\'\)\}\.\!\?\,\;\:\]\*\$++ \
  | sed s+\^\[\"\`\(\{\[\]\*++ \
  | grep -v \^\[\'\#\$\]\\\{0,1\\\}'[0-9.+()]*$' \
  | grep -v '^[0-9.+()]*%\{0,1\}$' \
  | grep -E -v '^([0-1]?[0-9]|2[0-3]):[0-5][0-9](:[0-5][0-9])?$' \
  | grep -E -v '^([0-9]*([0456789]th|11th|12th|13th)|([0-9]*[023456789])?(1st|2nd|3rd))$'
