#!/usr/bin/perl -w # # $Id: dirinfo.pl,v 1.8 2003/01/13 05:28:42 jmates Exp $ # # Copyright (c) 2000-2002, Jeremy Mates. This script is free # software; you can redistribute it and/or modify it under the same # terms as Perl itself. # # Run perldoc(1) on this file for additional documentation. # ###################################################################### # # REQUIREMENTS require 5; use strict; ###################################################################### # # MODULES use Carp; # to die for use Getopt::Std; # command line option handling use File::Basename; # for getting extensions use File::Spec; # for platform-independant file path ops ###################################################################### # # VARIABLES my $VERSION; ($VERSION = '$Revision: 1.8 $ ') =~ s/[^0-9.]//g; # where everything is stored, options, array lookup table my (%d_info, %opts, %di); my $path; # where we are, so to speak # hash that holds array positions for various elements in the %d_info hash o' arrays # probably should rearrange the array positions $di{'cnt_d'} = 0; # sub-dir counter $di{'cnt_f'} = 1; # sub-file counter $di{'cnt_e'} = 2; # sub-file-with-extensions counter $di{'cnt_o'} = 3; # not directory or file counter (e.g. links, character devices) $di{'ssize'} = 4; # sum of file sizes in dir $di{'size'} = 5; # size of largest file in dir $di{'fsize'} = 6; # name of largest file in dir $di{'min'} = 7; # least recently modified file $di{'fmin'} = 8; # ... and it's path $di{'max'} = 9; # most recently modified file $di{'fmax'} = 10; # ... and it's path $di{'mean'} = 11; # mean modification date $di{'stdev'} = 12; # standard deviation of mod dates $di{'wtref'} = 13; # reference to array of wtimes for the directory $di{'szref'} = 14; # ref to array of file sizes in directory $di{'szmn'} = 15; # mean file size $di{'szstd'} = 16; # ... std deviation of file sizes # controls the default output format of the humanizing routine that # makes file sizes readable by a human. my %global_prefs = ( # include decimals in output? (e.g. 25.8 K vs. 26 K) 'decimal' => 1, # include .0 in decmail output? 'decimal_zero' => 1, # what to divide file sizes down by 'factor' => 1024, # percentage above which will be bumped up # (e.g. 999 bytes -> 1 K as within 5% of 1024) # set to undef to turn off 'fudge' => 0.95, # lengths above which decimals will not be included # for better readability 'max_human_length' => 2, # list of suffixes for human readable output 'suffix' => ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'], ); ###################################################################### # # MAIN # parse command-line options getopts('h?lmoas:p:z:', \%opts); # help blab if they want it if (exists $opts{'h'} or exists $opts{'?'}) { print <<"HELP"; Usage: $0 [options] dir1 dir2 .. dirN A recursive directory information gleaner. Options for version $VERSION: -h/-? Display this message -m Display results in ugly format (raw data using TSV) -o Display results in a pretty, human readable format (default) -a Include averages, std. dev info (requires more memory) -l Limit search to current directory only (a la CVS :) -s n Summarize at depth n (where n is 0 or higher) in file tree; default is 0, summarize at the level of each passed dir. -H Do not humanize the output file sizes. (default is to) -p expr Perl expression describing what directories to prune out -z expr As -p except describes what files to exclude from stats Run perldoc(1) on this script for more information. HELP exit; } # Directories to examine either on the command line or riding # in on an input stream chomp(@ARGV = ) unless @ARGV; # at what depth of recursion should we start adding sub directory # results to the "parent" directory instead of individually my $depth = 0; $depth = $opts{'s'} if exists $opts{'s'}; for $path (@ARGV) { parsedir($path, 0, $path); } # data we want should now be lumbering around in %d_info hash... # do the machine format if called for if (exists $opts{'m'}) { for (sort { $a cmp $b || length $a <=> length $b } keys %d_info) { print $_, "\t", "\n"; for (@{$d_info{$_}}) { unless (ref $_) { print $_, "\t"; } else { print join (",", @$_), "\t"; } } print "\n"; } } # do the human format? unless (exists $opts{'m'}) { local $^W = 0; # to hide 'Use of uninitialized value' errors for (sort { $a cmp $b || length $a <=> length $b } keys %d_info) { my $ext_cnt = $d_info{$_}->[$di{'cnt_e'}]; my $file_cnt = $d_info{$_}->[$di{'cnt_f'}]; my $dir_cnt = $d_info{$_}->[$di{'cnt_d'}]; my $other_cnt = $d_info{$_}->[$di{'cnt_o'}]; my $items_cnt = $file_cnt + $dir_cnt + $other_cnt; my $ext_pcnt; if ($file_cnt != 0 || $dir_cnt != 0 || $other_cnt != 0) { print $_, ":\n"; print ' Directories: ', $dir_cnt, "\n" unless $dir_cnt == 0; print ' Files: ', $file_cnt unless $file_cnt == 0; if ($file_cnt > 0) { $ext_pcnt = sprintf("%.1f", (($ext_cnt / $file_cnt) * 100)) . "%"; } else { $ext_pcnt = '0%'; } $ext_cnt = 0 unless defined $ext_cnt; print ' (', $ext_cnt, ' have extensions - ', $ext_pcnt, ")\n" unless $file_cnt == 0; print ' Other: ', $other_cnt, "\n" unless $other_cnt == 0; print ' Wasted: ', humanize($d_info{$_}->[$di{'ssize'}]), "\n" unless $d_info{$_}->[$di{'ssize'}] == 0; print ' Mean Size: ', humanize(sprintf("%.f", $d_info{$_}->[$di{'szmn'}])), "\n" unless $items_cnt < 2 || !defined $d_info{$_}->[$di{'szmn'}]; print ' Std.Dev. Sz.: ', humanize(sprintf("%.f", $d_info{$_}->[$di{'szstd'}])), "\n" unless $d_info{$_}->[$di{'szstd'}] == 0; print ' Largest File: ', humanize($d_info{$_}->[$di{'size'}]), " ", $d_info{$_}->[$di{'fsize'}], "\n" unless $d_info{$_}->[$di{'size'}] == 0; print ' Oldest File: ', get_date($d_info{$_}->[$di{'min'}]), " ", $d_info{$_}->[$di{'fmin'}], "\n"; print ' Newest File: ', get_date($d_info{$_}->[$di{'max'}]), " ", $d_info{$_}->[$di{'fmax'}], "\n" unless $d_info{$_}->[$di{'fmin'}] eq $d_info{$_}->[$di{'fmax'}]; print ' Mean Date: ', get_date($d_info{$_}->[$di{'mean'}]), "\n" unless $items_cnt < 2 || !defined $d_info{$_}->[$di{'mean'}]; print ' Deviation: ', get_time_diff($d_info{$_}->[$di{'stdev'}]), ' (', sprintf("%.f", $d_info{$_}->[$di{'stdev'}]), ")\n" unless $d_info{$_}->[$di{'stdev'}] == 0; } else { print $_, " (Empty Directory) \n"; } print "\n"; } } exit; ###################################################################### # # SUBROUTINES sub parsedir { my $dir = shift; # where we should be poking around my $level = shift; # to what depth we have recursed my $previous = shift; # whence we have come # my $item; # current thingy dealing with (for loop later) my $target; # d_info directory to target for new info # Statistical Stuff # my @wtimes; # temp array for time-last-modified of each file in a folder # my($min_time, $min_time_file); # my($max_time, $max_time_file); # my($max_size, $max_size_file); # warn ' ' x $level, $dir, "\n"; # figure out what part of %d_info to shove our info into based on # depth limit if ($level > $depth) { $target = $previous; } else { $target = $dir; } # warn "$dir -> $target (level: $level; depth: $depth)\n"; #DBG opendir DIR, $dir or warn 'Could not open ', $dir, ' (', $!, ")\n"; DIRITEM: for (readdir DIR) { my $pti = File::Spec->catfile($dir, $_); next DIRITEM if m/^\.{1,2}$/; # prune out . and .. "files" # (otherwise, script will happily "descend" into that '.' dir!) # warn "Dealing with $pti\n"; #DBG if (-d $pti) { local $^W = 0; # hide 'Use of uninitialized value' warnings # see whether this dir needs to be pruned from the search if (exists $opts{'p'}) { my $results = eval "return 1 if( " . $opts{'p'} . " );"; if ($@) { chomp($@); die "Prune error: ", $@; # croak on errors } if ($results) { # warn "Pruned $pti\n"; next DIRITEM; } } # (optionally) recuse on downwards unless (-l $pti) { parsedir($pti, $level + 1, $target) unless exists $opts{'l'}; } else { # directory links evil. (well, according to this script) warn "Directory link $pti skipped to avoid possible infinite loop" if exists $opts{'v'}; $d_info{$target}->[$di{'cnt_o'}]++; } # collect some stats on this directory... my $dmtime = (stat(_))[9]; if (($dmtime < $d_info{$target}->[$di{'min'}]) || (!defined $d_info{$target}->[$di{'min'}])) { # warn "Oldest changed to $pti $dmtime vs. $d_info{$target}->[$di{'min'}]\n"; #DBG $d_info{$target}->[$di{'min'}] = $dmtime; $d_info{$target}->[$di{'fmin'}] = $pti; } if ($dmtime > $d_info{$target}->[$di{'max'}]) { $d_info{$target}->[$di{'max'}] = $dmtime; $d_info{$target}->[$di{'fmax'}] = $pti; } # add last write time to array push @{$d_info{$target}->[$di{'wtref'}]}, $dmtime if exists $opts{'a'}; # increment dir counter $d_info{$target}->[$di{'cnt_d'}]++; } elsif (-f $pti) { local $^W = 0; # hide 'Use of uninitialized value' warnings # see whether this file is to be skipped if (exists $opts{'z'}) { my $results = eval "return 1 if( " . $opts{'z'} . " );"; if ($@) { chomp($@); die "Skip error: ", $@; # croak on errors } if ($results) { # warn "Pruned $pti\n"; next DIRITEM; } } my ($fsize, $fmtime) = (stat(_))[7, 9]; # do stuff with the file size attribute if ($fsize > $d_info{$target}->[$di{'size'}]) { $d_info{$target}->[$di{'size'}] = $fsize; $d_info{$target}->[$di{'fsize'}] = $pti; } push @{$d_info{$target}->[$di{'szref'}]}, $fsize if exists $opts{'a'}; $d_info{$target}->[$di{'ssize'}] += $fsize; # do stuff with modification time attribute if (($fmtime < $d_info{$target}->[$di{'min'}]) || (!defined $d_info{$target}->[$di{'min'}])) { # warn "Oldest changed to $pti $fmtime vs. $d_info{$target}->[$di{'min'}]\n"; #DBG $d_info{$target}->[$di{'min'}] = $fmtime; $d_info{$target}->[$di{'fmin'}] = $pti; } if ($fmtime > $d_info{$target}->[$di{'max'}]) { $d_info{$target}->[$di{'max'}] = $fmtime; $d_info{$target}->[$di{'fmax'}] = $pti; } push @{$d_info{$target}->[$di{'wtref'}]}, $fmtime if exists $opts{'a'}; # and keep a count going... $d_info{$target}->[$di{'cnt_f'}]++; $d_info{$target}->[$di{'cnt_e'}]++ if get_ext($pti); # print ' ' x ($level +1), $pti, " $fmtime\n"; } else { local $^W = 0; # hide 'Use of uninitialized value' warnings $d_info{$target}->[$di{'cnt_o'}]++; # links don't return mod times, so exclude them from stats! } } closedir DIR; # now, gain some statistical info from the wtimes array # but only if we're about to drop out of a directory # we've been targeting due to depth summarization if ($target eq $dir) { # any mod times worth mentioning? if ($d_info{$target}->[$di{'wtref'}]) { $d_info{$target}->[$di{'mean'}] = mean($d_info{$target}->[$di{'wtref'}]); # warn "Got mean of $d_info{$target}->[$di{'mean'}] for $target\n"; # DBG # warn join("\n$.", sort { $a <=> $b } (@{$d_info{$target}->[$di{'wtref'}]})), "\n" if $target =~ m/p1$/; #DBG $d_info{$target}->[$di{'stdev'}] = standard_deviation( $d_info{$target}->[$di{'wtref'}], $d_info{$target}->[$di{'mean'}] ) if $d_info{$target}->[$di{'mean'}]; # warn "Got deviation of $d_info{$target}->[$di{'stdev'}]\n"; #DBG } # any file sizes? if ($d_info{$target}->[$di{'szref'}]) { $d_info{$target}->[$di{'szmn'}] = mean($d_info{$target}->[$di{'szref'}]); $d_info{$target}->[$di{'szstd'}] = standard_deviation( $d_info{$target}->[$di{'szref'}], $d_info{$target}->[$di{'szmn'}] ) if $d_info{$target}->[$di{'szmn'}]; } } } # Uses the standard File::Basename module to return the extension # of a passed file path, if any. Optionally, you can specify the # regex to use to match the extension as a second argument. Pass # '\..*' to match stuff like ".tar.gz" (default is ".gz" only). sub get_ext { my $ext_path = shift; # (optional) pass '\..*' to match stuff like '.tar.gz' # my regex here matches ".gz" (but not dot files without # additional dot's :) my $ext_pattern = shift || '.\.[^.]+'; return (fileparse($ext_path, $ext_pattern))[2]; } # $mean = mean(\@array) computes the mean of an array of numbers. # sub mean { my ($arrayref) = shift; my $result; for (@$arrayref) { $result += $_ } return $result / @$arrayref; } # $sd = standard_deviation_data(\@array) computes the standard # deviation of an array of numbers. # sub standard_deviation { my $arrayref = shift; my $mean = shift; # JAM there is a faster (6%) algorithm in Mastering Algorithms # with perl, but it chokes on certain directories where the # epoch times are all within a second or two of one another return sqrt(mean([map (($_ - $mean)**2, @$arrayref)])); } # simple routine that puts commas in passed numbers sub commify { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text; } # returns a custom date tag for the human-readable format sub get_date { my $thingy = shift; my ($sec, $min, $hour, $mday, $mon, $year, $wday) = localtime($thingy); my $month = (qw:Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec:)[$mon]; my $dayofweek = (qw:Sun Mon Tue Wed Thu Fri Sat:)[$wday]; $mon++; $year += 1900; return sprintf("%04d $month %02d %02d:%02d:%02d $dayofweek", $year, $mday, $hour, $min, $sec); } # retuns a delta time value (for the standard deviation) sub get_time_diff { my $difference = shift; $difference = int($difference); my $seconds = $difference % 60; $difference = ($difference - $seconds) / 60; my $minutes = $difference % 60; $difference = ($difference - $minutes) / 60; my $hours = $difference % 24; $difference = ($difference - $hours) / 24; my $days = $difference % 7; my $weeks = ($difference - $days) / 7; # probably a better way to do this! my $temp = ($weeks) ? "$weeks weeks, " : ''; $temp .= ($days) ? "$days days, " : ''; return $temp . sprintf("%02d:%02d:%02d", $hours, $minutes, $seconds); } # Inspired from GNU's df -h output, which fixes 133456345 bytes # to be something human readable. # # takes a number, returns formatted string. Also takes optional # hash containing various defaults that affect output style. sub humanize { my $num = shift; # could also take a array ref or hash ref to parse thru? return $num if exists $opts{'H'}; # lazy hack. don't do this. my %prefs = @_; # inherit global prefs, but give preference to user supplied ones unless (keys %prefs) { %prefs = %global_prefs; } else { # benchmarking w/ 5.6.0 on Linux PPC & i386 showed this next # faster than direct merge method (p. 145 Perl Cookbook) while (my ($k, $v) = each(%global_prefs)) { $prefs{$k} = $v unless exists $prefs{$k}; } } # some local working variables my $count = 0; my $prefix = ''; my $tmp = ''; # handle negatives if ($num < 0) { $num = abs $num; $prefix = '-'; } # reduce number to something readable by factor specified while ($num > $prefs{'factor'}) { $num /= $prefs{'factor'}; $count++; } # optionally fudge "near" values up to next higher level if (defined $prefs{'fudge'}) { if ($num > ($prefs{'fudge'} * $prefs{'factor'})) { $count++; $num /= $prefs{'factor'}; } } # no .[1-9] decimal on longer numbers for easier reading # only show decimal if prefs say so if (length sprintf("%.f", $num) > $prefs{'max_human_length'} || !$prefs{'decimal'}) { $tmp = sprintf("%.0f", $num); } else { $tmp = sprintf("%.1f", $num); # optionally hack trailing .0 as is not needed $tmp =~ s/\.0$// unless $prefs{'decimal_zero'}; } # return number with proper style applied return $prefix . $tmp . $prefs{'suffix'}->[$count]; } ###################################################################### # # DOCUMENTATION =head1 NAME dirinfo.pl - summarizes directory information. =head1 SYNOPSIS A summary of file and directory information for your home directory: $ dirinfo.pl ~/ A one level-deep report for the root filesystem, skipping over nfs: # dirinfo.pl -p 'm!^/nfs!' -s 1 / =head1 DESCRIPTION This script summarizes and prints out various information about a directory, including space consumed by the directory tree, largest file, oldest file, etc. It's a bit slow, and the code is horrible as I wrote it a while back. :) =head1 USAGE $ dirinfo.pl [options] dir1 [dir2 .. dirN] Directories will be read from STDIN if they are ommited from the command line. See L<"OPTIONS"> for details on the command line switches. =head1 OPTIONS The following command line options are available: =over 4 =item B<-h/-?> Display a brief little help blarb. =item B<-v> Become a bit more chatty about the whole process. (Currently only turns on annoying link-skipping warnings. :) =item B<-m> Display results in ugly format (raw data using TSV). =item B<-o> Display results in a quasi-human readable format (default). =item B<-a> Include averages, std. dev info. =item B<-l> Limit search to current directory only. =item B<-s n> Summarize at depth n (where n is 0 or higher) in file tree; default is 0, summarize at the level of each passed dir. =item B<-H> Do not humanize the output file sizes. Default is to convert ungainly numbers such as 406502769 to human readable numbers such as 388M. =item B<-p expr> Perl expression that will result in the current directory (stored in $_) being pruned out of the tree. Use this to skip "dot directories," for example: -p 'm/^\../' =item B<-z expr> As -p except describes what files to exclude from stats. =back =head1 EXAMPLES None yet. =head1 BUGS =head2 Reporting Bugs Newer versions of this script may be available from: http://sial.org/code/perl/ If the bug is in the latest version, send a report to the author. Patches that fix problems or add new features are welcome. =head2 Known Issues No known bugs. =head1 TODO Rewrite chicken scratch code in a better format. :) =head1 SEE ALSO perl(1) =head1 AUTHOR Jeremy Mates, http://sial.org/contact/ =head1 COPYRIGHT Copyright (c) 2000-2002, Jeremy Mates. This script is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 VERSION $Id: dirinfo.pl,v 1.8 2003/01/13 05:28:42 jmates Exp $ =cut