#!/usr/bin/perl -w use strict; $|=1; $SIG{__DIE__} = \&printstats; my ($opt_v, $opt_s, $opt_S, $opt_U,); use Getopt::Long qw(:config no_ignore_case bundling); GetOptions( 'v=i' => \$opt_v, # verbose 's=s' => \$opt_s, 'S=s' => \$opt_S, 'U' => \$opt_U, # UNLINK ); my $minsize = ( defined $opt_s ? &shiftsize($opt_s,"MINSIZE") : 0); # 128*1024 ); my $maxsize = ( defined $opt_S ? &shiftsize($opt_S,"MAXSIZE") : 0); # 0 ); my $hashbufsize = 10*1024*1024; my $prefbytes = 1024*1024; my $verbosity = ( $opt_v ? $opt_v : 2 ); my $reportsize = 850*1024*1024; use File::stat; my (%by_size, %by_inode,); sub scan_dir($){ my ($targ, $verb,) = @_; die "not a dir" unless -d $targ; printf("SCANNING: %s\n", $targ) if $verb>0; &stats("scan dir"); my ($dir, $sub,); opendir($dir, $targ) or die "cant open $targ: $!"; while ($sub = readdir($dir)) { next if $sub =~ /^\.{1,2}$/; my $fn = $targ."/".$sub; #printf "DBG: fn = %s\n", $fn; next if -l $fn; die "vanished $fn" unless -e $fn; if (-d $fn) { &scan_dir($fn, $verb-1); } elsif (-f $fn) { &scan_file($fn, $verb-1); } else { &stats("scan funny"); } } closedir($dir); } sub scan_file($) { my ($fn,) = @_; my $st = lstat($fn); &stats("scan file"); if ($minsize && $st->size < $minsize) { &stats("scan skipped tiny"); return; } if ($maxsize && $st->size > $maxsize) { &stats("scan skipped big"); return; } my $key = $st->dev.".".$st->ino; unless (defined $by_inode{$key}) { if ($reportsize && $st->size > $reportsize) { printf "HUGE: %s\n", $fn; } &stats("scan inode"); $by_inode{$key} = { st => $st, fn => [], }; my $size = $st->size; unless (defined $by_size{$size}) { &stats("scan sizes"); $by_size{$size} = []; } push @{$by_size{$size}}, $key; } else { &stats("scan saved inodes"); &stats("scan saved bytes", $st->size); } push @{$by_inode{$key}{fn}}, $fn; } sub shiftsize{ my ($val, $name,) = @_; die "bad size: $val\n" unless $val =~ /^(\d+(\.\d+)?)([kmgt]?)$/i; my $nuval = $1 * ({ "" => 2**0, k => 2**10, m => 2**20, g => 2**30, t => 2**40, }->{lc($3)}||1); printf "%s: '%s' -> %i byte\n", $name, $val, $nuval; return $nuval; } @ARGV=(".",) unless @ARGV; while (@ARGV) { my $targ = shift @ARGV; if ($targ eq "-l") { my $list = shift @ARGV; printf "LIST: %s\n", $list; open(LST, $list) or die "cant open $list: $!"; while () { chomp; unshift @ARGV, $_; } close(LST); } elsif (-d $targ) { $targ =~ s/\/$//g; &scan_dir($targ, $verbosity); } elsif (-f $targ) { &scan_file($targ); } else { die "whatis: $targ"; } } &printstats; # check/hash dupe sizes use Digest::MD5; my (%by_prefhash, %by_hash,); for my $size (sort {scalar(@{$by_size{$b}}) <=> scalar(@{$by_size{$a}})} keys %by_size) { if (scalar(@{$by_size{$size}}) < 2) { # free up ram delete $by_inode{$by_size{$size}[0]}; delete $by_size{$size}; &stats("size skipped singleton"); next; } printf "SIZE: %s bytes, %s targets\n", $size, scalar (@{$by_size{$size}}); for my $din (@{$by_size{$size}}) { if (scalar(@{$by_inode{$din}{fn}}) < $by_inode{$din}{st}->nlink) { printf "MISSINGLINKS: %i %s\n", $din, $by_inode{$din}{fn}[0]; delete $by_inode{$din}; &stats("prefhashed missinglinks"); next; } &stats("prefhashed files"); my $fn = $by_inode{$din}{fn}[0]; unless (-f $fn && open(INF, $fn)) { warn "cant open '$fn': $!"; next; } printf("PREFHASHING: %s\n", $fn) if $opt_v; my $ctx = Digest::MD5->new; my $buf; my $bytes = sysread(INF, $buf, $prefbytes); &stats("prefhashed bytes", $bytes); $ctx->add($buf); close(INF); my $digest = $ctx->hexdigest; my $stor = \%by_prefhash; if ($bytes < $prefbytes && $bytes == $size) { # already hashed the whole thing $stor = \%by_hash; } unless (defined $stor->{$digest}) { &stats("prefhashed unique"); $stor->{$digest} = []; } push @{$stor->{$digest}}, $din; } &hashfold(); } &printstats; # check/hash dupe sizes for my $prefhash (sort keys %by_prefhash) { if (scalar(@{$by_prefhash{$prefhash}}) < 2) { delete $by_inode{$by_prefhash{$prefhash}[0]}; delete $by_prefhash{$prefhash}; &stats("prefhash skipped singleton"); next; } for my $din (@{$by_prefhash{$prefhash}}) { &stats("hashed files"); my $fn = $by_inode{$din}{fn}[0]; unless (-f $fn && open(INF, $fn)) { warn "cant open '$fn': $!"; next; } printf("HASHING: %s\n", $fn) if $opt_v; my $ctx = Digest::MD5->new; my $buf; while (sysread(INF, $buf, $hashbufsize)) { &stats("hashed bytes", length($buf)); $ctx->add($buf); } close(INF); my $digest = $ctx->hexdigest; unless (defined $by_hash{$digest}) { &stats("hashed unique"); $by_hash{$digest} = []; } push @{$by_hash{$digest}}, $din; } &hashfold(); } &printstats; # fold by hash &hashfold(); &printstats; exit 0; sub hashfold { for my $hash (sort keys %by_hash) { if (scalar(@{$by_hash{$hash}}) < 2) { &stats("fold skipped singleton"); &stats("hashwasted bytes", $by_inode{$by_hash{$hash}[0]}{st}->size); delete $by_inode{$by_hash{$hash}[0]}; delete $by_hash{$hash}; next; } # find newest my ($ndin, $nmtime,); for my $din (@{$by_hash{$hash}}) { if ((!defined $ndin) || ($nmtime < $by_inode{$din}{st}->mtime)) { $ndin = $din; $nmtime = $by_inode{$din}{st}->mtime; } } for my $din (@{$by_hash{$hash}}) { next if $din eq $ndin; die "different dev for $din vs $ndin" unless $by_inode{$din}{st}->dev == $by_inode{$ndin}{st}->dev; die "same ino for $din vs $ndin" if $by_inode{$din}{st}->ino == $by_inode{$ndin}{st}->ino; my $src = $by_inode{$ndin}{fn}[0]; for my $dst (@{$by_inode{$din}{fn}}) { printf "FOLD: %s => %s\n", $dst, $src; unlink($dst) or die "cant unlink $dst"; unless ($opt_U) { link($src, $dst) or die "cant link $dst => $src: $!"; } &stats("fold liberated files"); } &stats("fold liberated bytes", $by_inode{$ndin}{st}->size); &stats("fold liberated inodes"); delete $by_inode{$din}; } delete $by_hash{$hash}; } #sanity check die "stuff left in by_hash" if %by_hash; } ### tooooolz my %stats; sub stats{ my ($key, $val,) = @_; $val = 1 unless defined $val; $stats{$key} += $val; } sub printstats{ for (sort keys %stats) { my ($key, $val,) = ($_, $stats{$_},); if ($key =~ / bytes$/) { if ($val > 2**30) { $val = sprintf "%.1fG", $val / 2**30; } elsif ($val > 2**20) { $val = sprintf "%.1fM", $val / 2**20; } elsif ($val > 2**10) { $val = sprintf "%.1fk", $val / 2**10; } } printf " %s: %s\n", $key, $val; } %stats=(); }