#! /bin/dash
# repeats: Searches for duplicate files in the specified directories

# Copyright (C) 2004-2012 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The repeats utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The repeats utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <http://www.gnu.org/licenses/>.

# get a valid temporary-file wildcard and set up cleanup traps
TMPWILD=$(tempname -w "repeats_$$") || exit 99
# ${TMPWILD} is expanded unquoted on purpose so that the shell globs it; it is
# presumably a wildcard matching every temp file created by this run (confirm
# against the tempname documentation).
# Remove temp files on every exit path.  The EXIT action does not call "exit"
# itself, so the script's real exit status is preserved.
trap 'rm -f ${TMPWILD}' 0
# On HUP, INT, QUIT, PIPE and TERM: exit with failure; the EXIT trap above
# then performs the cleanup.
trap 'exit 1' 1 2 3 13 15

# usage: print the version banner and the option summary to stdout
# (callers redirect it to stderr when it accompanies an error)
usage() {
  echo 'repeats 1.0.30'
  echo 'usage: repeats [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]'
  echo '         [-m(idsize) bytecount] [-p(aranoid)] [-v(erbose)] [-z(eros)]'
  echo '         [directory ...]'
  echo 'algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256 (default),'
  echo '             5 = SHA384, 6 = SHA512'
}

# get command-line options
ALGORITHM='4'    # -a: algorithm number, later handed to filehash as -<number>
HARDLINKS='n'    # -l: 'y' skips the hardlink-elimination stage
MIDSIZE='4096'   # -m: byte count handed to filehash -n for the partial hash
PARANOID='n'     # -p: 'y' adds a final rep_cmp pass over the matches
VERBOSE='n'      # -v: 'y' prints per-stage counts to stderr
ZEROS='n'        # -z: 'y' keeps zero-size entries in the candidate list
while getopts a:hlm:pvz opts
do
  case $opts in
    a) ALGORITHM=${OPTARG} ;;
    h) usage
       exit 0 ;;
    l) HARDLINKS='y' ;;
    m) MIDSIZE=${OPTARG} ;;
    p) PARANOID='y' ;;
    v) VERBOSE='y' ;;
    z) ZEROS='y' ;;
    *) # invalid option: the usage text is a diagnostic here, so keep it off
       # stdout, which is reserved for the list of duplicates
       usage 1>&2
       exit 1 ;;
  esac
done
# drop the parsed options, leaving only the directory arguments
shift $((OPTIND - 1))

# Stage 1: list the files to examine and keep those that rep_size reports.
# The result is written to ${TMPFILE1}, which the following stages read.

# tab character used as the field separator; held in a variable so the
# invisible character cannot be lost when the file is edited
TAB=$(printf '\t')

# rep_size_stage: read file names (one per line) on stdin, write the rep_size
# output to stdout.  Unless -z was given, lines ending in <TAB>0 (zero-size
# entries, going by the filesize tool's name) are dropped first.
rep_size_stage() {
  if [ "$ZEROS" = 'n' ]; then
    filesize -p | grep -v "${TAB}0\$" | sort -t "${TAB}" -k2n,2n | \
      /usr/lib64/littleutils/rep_size
  else
    filesize -p | sort -t "${TAB}" -k2n,2n | \
      /usr/lib64/littleutils/rep_size
  fi
}

# die_unreadable DIR: report DIR on stderr, remove any temp files, exit 1
die_unreadable() {
  echo "repeats error: $1 is not a readable directory" 1>&2
  # unquoted on purpose: ${TMPWILD} must undergo pathname expansion
  rm -f ${TMPWILD}
  exit 1
}

# run through current directory
if [ $# -eq 0 ]; then

  TMPFILE1=$(tempname "repeats_$$_1") || exit 99
  # strip the leading "./" that find prepends to every name
  find . -type f -print | sed -e 's/^\.\///' | rep_size_stage > "${TMPFILE1}"

# run through single specified directory
elif [ $# -eq 1 ]; then

  if [ -d "$1" ] && [ -r "$1" ] && [ -x "$1" ]; then
    TMPFILE1=$(tempname "repeats_$$_1") || exit 99
    find "$1" -type f -print | rep_size_stage > "${TMPFILE1}"
  else
    die_unreadable "$1"
  fi

# run through multiple specified directories
else

  TMPFILE0=$(tempname "repeats_$$_0") || exit 99
  while [ $# -gt 0 ]; do
    if [ -d "$1" ] && [ -r "$1" ] && [ -x "$1" ]; then
      find "$1" -type f -print >> "${TMPFILE0}"
    else
      die_unreadable "$1"
    fi
    shift
  done

  TMPFILE1=$(tempname "repeats_$$_1") || { rm -f ${TMPWILD}; exit 99; }
  # sort -u removes names listed twice (e.g. overlapping directory arguments)
  sort -u "${TMPFILE0}" | rep_size_stage > "${TMPFILE1}"
fi

if [ "$VERBOSE" = 'y' ]; then
  # print to stderr the preliminary number of matches based on filesize;
  # reading the file via stdin makes wc print only the count, so the temp
  # file name never has to be edited out of the message with sed
  echo "repeats message: num files with non-unique filesize = $(wc -l < "${TMPFILE1}")" 1>&2
fi

# search for duplicates based on node numbers (eliminate hardlinks)
# Input: ${TMPFILE1}.  Output: ${TMPFILE2}.
TMPFILE2=$(tempname "repeats_$$_2") || { rm -f ${TMPWILD}; exit 99; }
if [ "$HARDLINKS" = 'n' ]; then
  # tab field separator, held in a variable so the invisible character
  # cannot be lost when the file is edited
  TAB=$(printf '\t')
  # order rep_hard's output by field 4, then 3, then 2 (all numeric), then by
  # field 1, before handing it to rep_node
  sort "${TMPFILE1}" | /usr/lib64/littleutils/rep_hard -p | \
    sort -t "${TAB}" -k4n,4n -k3n,3n -k2n,2n -k1,1 | \
    /usr/lib64/littleutils/rep_node > "${TMPFILE2}"
  if [ "$VERBOSE" = 'y' ]; then
    # print to stderr the preliminary number of matches based on node numbers;
    # wc reads stdin so it prints the bare count without the file name
    echo "repeats message: num files excluding hardlinks = $(wc -l < "${TMPFILE2}")" 1>&2
  fi
else
  # -l given: skip this stage and pass the size-based list through unchanged
  mv "${TMPFILE1}" "${TMPFILE2}"
fi

# search for duplicates based on a partial filehash
# Input: ${TMPFILE2}.  Output: ${TMPFILE3}.
TMPFILE3=$(tempname "repeats_$$_3") || { rm -f ${TMPWILD}; exit 99; }
# tab field separator, held in a variable so the invisible character cannot
# be lost when the file is edited
TAB=$(printf '\t')
# ${ALGORITHM} and ${MIDSIZE} come straight from the command line, so they are
# quoted to keep them from being word-split or glob-expanded.
# -n presumably limits hashing to ${MIDSIZE} bytes (confirm in filehash docs).
sort "${TMPFILE2}" | filehash -v -s "-${ALGORITHM}" -p -n "${MIDSIZE}" | \
  sort -t "${TAB}" -k2n,2n -k3,3 -k1,1 | \
  /usr/lib64/littleutils/rep_hash > "${TMPFILE3}"
if [ "$VERBOSE" = 'y' ]; then
  # print to stderr the preliminary number of matches based on partial
  # filehash; wc reads stdin so it prints the bare count without the file name
  echo "repeats message: num pairs with matching partial hash = $(wc -l < "${TMPFILE3}")" 1>&2
fi

# search for duplicates based on a complete filehash
# Input: ${TMPFILE3}.  Output: ${TMPFILE4}.
TMPFILE4=$(tempname "repeats_$$_4") || { rm -f ${TMPWILD}; exit 99; }
# tab field separator, held in a variable so the invisible character cannot
# be lost when the file is edited
TAB=$(printf '\t')
# The sed call breaks each line at its first tab into two lines (it relies on
# sed understanding \t and \n, as GNU sed does); sort -u then removes names
# that appear more than once before they are hashed in full.
# ${ALGORITHM} comes from the command line, so it is quoted.
sed -e 's/\t/\n/' "${TMPFILE3}" | sort -u | \
  filehash -v -s "-${ALGORITHM}" -p | \
  sort -t "${TAB}" -k2n,2n -k3,3 -k1,1 | \
  /usr/lib64/littleutils/rep_hash > "${TMPFILE4}"
if [ "$VERBOSE" = 'y' ]; then
  # print to stderr the preliminary number of matches based on complete
  # filehash; wc reads stdin so it prints the bare count without the file name
  echo "repeats message: num pairs with matching complete hash = $(wc -l < "${TMPFILE4}")" 1>&2
fi

# do final paranoia check if requested
# Input: ${TMPFILE4}.  The final list of matches goes to stdout.
if [ "$PARANOID" = 'n' ]; then
  # make it final: print results to stdout
  sort "${TMPFILE4}"
else
  TMPFILE5=$(tempname "repeats_$$_5") || { rm -f ${TMPWILD}; exit 99; }
  # run the sorted matches through rep_cmp and keep what it emits
  sort "${TMPFILE4}" | /usr/lib64/littleutils/rep_cmp > "${TMPFILE5}"
  if [ "$VERBOSE" = 'y' ]; then
    # print to stderr the final number of matches based on cmp results;
    # wc reads stdin so it prints the bare count without the file name
    echo "repeats message: num pairs based on cmp results = $(wc -l < "${TMPFILE5}")" 1>&2
  fi
  cat "${TMPFILE5}"
fi

# clean up temp files
# NOTE: ${TMPWILD} is deliberately left unquoted so that the shell performs
# pathname expansion on it.  It is the output of "tempname -w" and presumably
# is a wildcard matching every temp file this run created -- confirm against
# the tempname documentation before changing the quoting.
rm -f ${TMPWILD}
