#!/bin/sh
# bigshuf - developed by acidvegas (https://git.acid.vegas/random)
# shuffles the lines in large files, randomizing the order while using a memory-safe approach

# Check if enough arguments are provided
if [ "$#" -lt 3 ]; then
    echo "Usage: $0 inputfile tempdir outputfile [lines per chunk]" >&2
    exit 1
fi

# Parse input arguments
inputfile="$1"
tempdir="$2"
outputfile="$3"
lines_per_chunk="${4:-10000}"

# Check if input file exists
if [ ! -f "$inputfile" ]; then
    echo "Error: Input file does not exist" >&2
    exit 1
fi

# Check that tempdir exists and is writable, creating it if necessary
# (this must happen before the disk space check, since df needs an existing path)
if [ ! -d "$tempdir" ]; then
    mkdir -p "$tempdir" || { echo "Error: Unable to create temp directory" >&2; exit 1; }
elif [ ! -w "$tempdir" ]; then
    echo "Error: Temp directory is not writable" >&2
    exit 1
fi

# Calculate required and available space, both in bytes
# (the chunk files temporarily need roughly another full copy of the input)
required_space=$(( $(wc -c < "$inputfile") * 2 ))
available_space=$(df --block-size=1 --output=avail "$tempdir" | tail -n 1)

# Check if there is enough disk space in tempdir
if [ "$available_space" -lt "$required_space" ]; then
    echo "Error: Not enough disk space in $tempdir" >&2
    exit 1
fi

# Split the file into fixed-size chunks by line count
split -l "$lines_per_chunk" "$inputfile" "$tempdir/chunk_" || { echo "Error: Failed to split file" >&2; rm -rf "$tempdir"; exit 1; }

# Create a file with a shuffled list of chunk files
find "$tempdir" -name 'chunk_*' | shuf > "$tempdir/chunks_list.txt" || { echo "Error: Failed to create shuffled chunks list" >&2; rm -rf "$tempdir"; exit 1; }

# Shuffle each chunk in the shuffled order and append it to the output file
while read -r chunk; do
    shuf "$chunk" >> "$outputfile" || { echo "Error: Failed to shuffle and append chunk $chunk" >&2; rm -rf "$tempdir"; exit 1; }
done < "$tempdir/chunks_list.txt"

# Clean up
rm -rf "$tempdir"
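
# Example invocation (a sketch only; the file names, scratch directory, and
# chunk size below are illustrative and not part of the script itself):
#   ./bigshuf huge_wordlist.txt /mnt/scratch/bigshuf_tmp shuffled_wordlist.txt 50000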