spectrumwordlistgen/make_spectrum_wordlist.sh

61 lines
1.2 KiB
Bash
Executable File

#!/usr/bin/zsh
#
# Build a shuffled wordlist of "<word><word><3 digits>" candidates.
#
# Usage: make_spectrum_wordlist.sh <outfile-root> <num_words0> <num_words1> <dictfile>
#   outfile-root  prefix of every file written: <root>.txt, <root>-NNN and the
#                 scratch file <root>.tmp.txt
#   num_words0    number of random dictionary lines used as first word
#                 (100 when passed as an empty string)
#   num_words1    number of random dictionary lines drawn as second-word
#                 candidates for each first word (100 when passed empty)
#   dictfile      word list, one word per line
#
# Steps:
#   1. remove output left over from a previous run with the same root
#   2. write every <first><second> pair to the scratch file
#   3. append 000..999 to every pair, writing <root>.txt
#   4. de-duplicate and shuffle <root>.txt in place
#   5. split <root>.txt into 100000-line pieces named <root>-000, <root>-001, ...
#
# Relies on shuf, sort -R and split -d as provided by GNU coreutils.
# Only constructs accepted by both zsh and bash are used, and every expansion
# is quoted so paths containing whitespace or glob characters are safe.

OUTFILE_ROOT="$1"
COUNT0="${2:-100}"
COUNT1="${3:-100}"
#DICTFILE="lowercase-english"
DICTFILE="$4"
TMPFILE="$OUTFILE_ROOT.tmp.txt"

if [ -z "$OUTFILE_ROOT" ]; then
  echo "Usage: $0 <outfile-root> <num_words0> <num_words1> <dictfile>" >&2
  exit 1
fi
if [ ! -f "$DICTFILE" ]; then
  echo "Error: $DICTFILE not found" >&2
  exit 1
fi

# Reject bad counts before anything is deleted; otherwise shuf would fail only
# after the previous output had already been removed.
for count in "$COUNT0" "$COUNT1"; do
  case "$count" in
    *[!0-9]*)
      echo "Error: word counts must be non-negative integers" >&2
      exit 1
      ;;
  esac
done

# Step 1: clear stale output. -f keeps rm quiet when a file does not exist yet.
rm -fv -- "$OUTFILE_ROOT.txt"
rm -fv -- "$TMPFILE"
# zsh reports an error for an unmatched glob by default; let it expand to
# nothing instead so a first run (no split files yet) stays clean.
if [ -n "${ZSH_VERSION:-}" ]; then
  setopt null_glob
fi
for stale in "$OUTFILE_ROOT"-*; do
  # bash leaves an unmatched pattern as literal text; skip that non-file.
  [ -e "$stale" ] || [ -L "$stale" ] || continue
  rm -v -- "$stale"
done

# Step 2: word pairs.
echo "Generating $TMPFILE"
shuf -n "$COUNT0" -- "$DICTFILE" | while IFS= read -r first; do
  # A blank dictionary line carries no word.
  [ -n "$first" ] || continue
  # -F/--: the word is matched literally, never as a regex or an option.
  # NOTE(review): this drops every candidate that CONTAINS the first word, not
  # only exact repeats (same as before) -- switch to -x if only "wordword"
  # doubles are meant to be excluded.
  shuf -n "$COUNT1" -- "$DICTFILE" | grep -v -F -- "$first" |
    while IFS= read -r second; do
      [ -n "$second" ] || continue
      printf '%s%s\n' "$first" "$second"
    done
done > "$TMPFILE"

# Step 3: one awk pass emits <pair>000 .. <pair>999 for every pair, instead of
# forking seq three times per pair and echoing each line from the shell.
echo "Generating $OUTFILE_ROOT.txt"
awk '{ for (n = 0; n <= 999; n++) printf "%s%03d\n", $0, n }' \
  "$TMPFILE" > "$OUTFILE_ROOT.txt"
rm -v -- "$TMPFILE"

# Step 4: sort reads all of its input before opening the -o file, so sorting
# the file onto itself is safe.
echo "Shuffling $OUTFILE_ROOT.txt"
sort -u -R -o "$OUTFILE_ROOT.txt" -- "$OUTFILE_ROOT.txt"
wc -l -- "$OUTFILE_ROOT.txt"

# Step 5: numeric (-d), three-character (-a 3) suffixes.
echo "Splitting $OUTFILE_ROOT.txt"
split -l 100000 -d -a 3 -- "$OUTFILE_ROOT.txt" "${OUTFILE_ROOT}-"
echo "Done"
echo "Split files are $OUTFILE_ROOT-000 to $OUTFILE_ROOT-xxx"