#! /usr/bin/env bash

# Convert table in html file $1 to csv.  Works on $1.html if $1 not found.  With no $1, uses standard input.
# Assumes there is just one table; if multiple, the contents will get combined.
# Assumes tags in lowercase.
# Output on standard output.

# Reference: csh-style variable modifiers and their bash parameter-expansion equivalents.
# $var:t    ${var##*/}
# $var:h    ${var%/*}
# $var:e    ${var##*.}
# $var:r    ${var%.*}

# Marker substituted for every </tr> so that row boundaries survive the removal
# of the input's own line breaks.  It is turned into a newline at the end.
# Assumes the byte \001 (SOH) does not occur in the HTML text itself.
readonly ROW_MARK=$'\001'

#######################################
# Filter: convert the HTML table read on standard input to CSV.
# Steps: keep only the lines from one containing "<table" through the next one
# containing "</table"; mark row ends; turn each </td> or </th> into a comma;
# strip every remaining tag; drop the original line breaks; turn the row marks
# into newlines; remove the trailing comma of each row; decode "&amp;".
# Limitations: fields are not quoted (a comma inside a cell is emitted as-is),
# "&amp;" is the only entity decoded, and tags must be lowercase.
# Arguments: none
# Outputs:   one CSV line per </tr> on standard output
#######################################
html_table_to_csv() {
  sed -n '\+<table+,\+</table+p' \
    | sed -e "s+</tr>+${ROW_MARK}+g" \
          -e 's+</t[dh]>+,+g' \
          -e 's+<[^>]*>++g' \
    | tr -d '\012\015' \
    | tr "${ROW_MARK}" '\012' \
    | sed -e 's+,$++' \
          -e 's+&amp;+\&+g'
}

#######################################
# Entry point.
# Arguments: $1 - optional path of the HTML file; "$1.html" is tried when $1
#                 is not a regular file.  Without $1, standard input is read.
# Outputs:   CSV on standard output; diagnostics on standard error.
# Returns:   exits with status 2 when neither $1 nor $1.html exists.
#######################################
main() {
  local inputfile=${1-}

  # No file argument: act as a plain stdin-to-stdout filter.
  if [[ -z "$inputfile" ]]; then
    html_table_to_csv
    return
  fi

  if [[ ! -f "$inputfile" ]]; then
    inputfile=$inputfile.html
  fi
  if [[ ! -f "$inputfile" ]]; then
    # stderr, so the message never gets mixed into the CSV output.
    printf 'htmltable2csv: %s not found\n' "$inputfile" >&2
    exit 2
  fi

  html_table_to_csv < "$inputfile"
}

main "$@"

