#! /bin/bash
#
# $Id: scramble,v 1.2 2002/12/08 22:40:51 relson Exp $ #
#
#  scramble -- split up files into entries assuming each entry
#                starts with a given separator
#              optionally, classify the entries
#              produce, on stdout, a random list of entries and
#                their locations and classes, or a stream of
#                entries in random order, with classifiers if given
#  needs:    bash basename rm grep awk wc perl dd
#  usage:    see function usage() below
#  version:  0.12 (Greg Louis <glouis@dynamicro.on.ca>)

# process ID of this shell; used as the suffix of the temporary files
# list.$pid and shuf.$pid that the script creates in the current directory
pid=$$

# usage -- print the command-line help on stdout, remove a partially
#          built list.$pid, and terminate the script
#  globals:  pid (read)
#  outputs:  the help text, on stdout
#  returns:  never; always exits with status 1
function usage() {
    # keep the name local, and quote $0 so a script path containing
    # blanks is still reduced to its base name correctly
    local iam
    iam=$(basename "$0")
    printf '%s\n' \
        "Usage: $iam separator [-l] [-c classID] filename [...]" \
        "       Files contain entries, each of which begins with" \
        "       text matching the separator.  Entries are listed or" \
        "       output in random order." \
        "Parameters:" \
        "       separator is a regex used by grep that matches the" \
        "       start of each entry." \
        "       -l indicates that the output is to be a list of" \
        "       entries.  If this option is not given, the output" \
        "       consists of the entries themselves." \
        "       classID is text used to identify the class of a" \
        "       given entry.  If no classID values are specified," \
        "       this field consists of a single . character.  If" \
        "       classID values are given and the -l option is not" \
        "       used, each entry in the output stream is preceded" \
        "       by a single line of the form %%-CLASS-ClassID-%%." \
        "       No one file may contain entries of more than one" \
        "       class." \
        "       Filenames may not contain blanks."
    # usage may be called after some offsets have already been collected
    rm -f list.$pid
    exit 1
}

# the first param is the separator: a grep regex that matches the first
# line of every entry
test "x$1" = "x" && usage
sep=$1
shift

stream=1        # 1: output the entries themselves; 0 (-l): output a list
classID="."     # "." stands for "no class given"
nfiles=0        # number of filenames processed so far

# get all the byte offsets in all the files, in one list
# each line of list.$pid reads "classID filename byte-offset"; the last
# line written for a file carries the file's total size
while [ $# -gt 0 ]; do
    case "$1" in
    -l)
        stream=0
        shift
        ;;
    -c)
        # -c must be followed by a value; without this check "shift 2"
        # fails, nothing is shifted and the loop never terminates
        [ $# -ge 2 ] || usage
        classID=$2
        shift 2
        ;;
    *)
        file=$1 ; shift
        if [ ! -r "$file" ]; then echo "$file not found"; usage; fi
        # grep -b prefixes every matching line with "offset:"; use the
        # separator supplied by the caller, and hand class and filename
        # to awk as variables rather than splicing them into the program
        grep -a -b -e "$sep" -- "$file" | \
            awk -F: -v cls="$classID" -v fn="$file" \
                '{print cls " " fn " " $1}' >>list.$pid
        wc -c -- "$file" | \
            awk -v cls="$classID" -v fn="$file" \
                '{print cls " " fn " " $1}' >>list.$pid
        nfiles=$(( nfiles + 1 ))
        ;;
    esac
done

# at least one filename is required; without one there is no list.$pid
# for the following steps to read
[ "$nfiles" -gt 0 ] || usage

# create a shuffled list, with lengths
# list.$pid holds "class file offset" lines.  The first line seen for a
# file only opens that file (its first entry is taken to start at offset
# 0); every later line closes the entry that began at the previous offset
# and yields "class file start length".
file=""
while read cls path off; do
    if [ "x$path" != "x$file" ]; then
        file=$path
        start=0
        continue
    fi
    echo "$cls $path $start $(( $off - $start ))"
    start=$off
done <list.$pid | perl -e '
    # give every distinct input line a random weight, then print the
    # lines ordered by descending weight
    srand(time() ^ ($$ + ($$ << 15)));
    $weight{$_} = rand() while <>;
    print for sort { $weight{$b} <=> $weight{$a} } keys %weight;
' >shuf.$pid

# produce the output from the shuffled "class file offset length" lines
case $stream in
0)
    # list mode: print the lines, dropping the class field when it is
    # the "." placeholder
    while read cls path off len; do
        if [ "$cls" != "." ]; then
            echo "$cls $path $off $len"
        else
            echo "$path $off $len"
        fi
    done <shuf.$pid
    ;;
*)
    # go through the list, extract the messages, mark the classes
    while read cls path off len; do
        [ "$cls" = "." ] || echo "%%-CLASS-$cls-%%"
        # copy exactly $len bytes starting at byte $off of the file
        dd if=$path bs=1 skip=$off count=$len 2>/dev/null
    done <shuf.$pid
    ;;
esac
# next line can be commented out for debugging
rm list.$pid shuf.$pid
