#!/usr/bin/perl -w # # $Id: diskusage,v 1.16 2006/06/01 22:15:51 jmates Exp $ # # The author disclaims all copyrights and releases this script into the # public domain. # # 'du' output parser and reporter. # # Run perldoc(1) on this script for additional documentation. # # TODO stat() files so can include other information (whether file or # dir) in output? # # TODO support rsync-style exclude files, so can check totals as rsync # would see a directory tree? use strict; # KLUGE humanize on file size assumes -k set, split for exec-as-list # will break quoted "-o 'foo bar'" options (minimal risk) my @du_cmd = split /\s+/, $ENV{DU_CMD} || 'du -k'; # regex to parse output of du with my $du_re = qr/^(\d+) \s+ (\S+)/ox; use Getopt::Std; my %opts; getopts 'h?xc:o:d:', \%opts; print_help() if exists $opts{h} or exists $opts{'?'}; print_help() if not exists $opts{c} and not @ARGV; # KLUGE pass through on -x "one filesystem" option to du if ( exists $opts{x} ) { push @du_cmd, '-x'; } my $output_tmpl = '%{size} %{path} %{humansize}\n'; $output_tmpl = $opts{o} if exists $opts{o}; # fix backslashed characters to literal, add newline if no trailing # whitespace found $output_tmpl =~ s/(\\.)/qq!"$1"!/eeg; $output_tmpl .= "\n" unless $output_tmpl =~ m/ \s$ /x; # split space-delimited key=value pairs. Backslash the space to include # a literal space my %defaults; if ( exists $opts{d} ) { my @pairs = split /(? $opts{c}, errno => $! } ) and exit 104; push @directories, load_dir_prefs( \*PREFS ); } if (@ARGV) { # read from standard input if - last argument if ( $ARGV[-1] eq '-' ) { pop @ARGV; push @directories, load_dir_prefs( \*STDIN ); } # KLUGE default dirs on command line to do 'dir *' type searches for (@ARGV) { my @wanted = ( { subject => 'subpath', value => '^$', op => 're' }, { subject => 'subpath', value => '^/[^/]+/?$', op => 're' } ); push @directories, { parentdir => $_, wanted => \@wanted }; } } for my $entry (@directories) { rundu( $entry->{parentdir}, $entry->{wanted} ); } exit; sub load_dir_prefs { my $fh = shift; my @dirs; while (<$fh>) { next if /^\s*$/; s/^\s+//; next if /^#/; chomp; s/\s+$//; # extend lines ending with \ if (s/ \\ $ //x) { $_ .= ; redo unless eof; } my @tokens; UBLE: { # unquoted text push( @tokens, $1 ), redo UBLE if m/ \G ( [^"'\s]+ ) \s* /cgx; # single or double quoted, allowing for backslashed internal escapes push( @tokens, $2 ), redo UBLE if m/ \G (['"]) ((?: \\.|[^\\\1] )+) \1 \s* /cgx; last UBLE if / \G $ /gcx; } if (@tokens) { # unescape things like "\ " or "\n" @tokens = map { s/(\\.)/qq!"$1"!/eeg; $_ } @tokens; my $dir = shift @tokens; # TODO only subpath matching for output right now, could add more my @wanted; # default to printing for the parentdir, which has no subpath push @wanted, { subject => 'subpath', value => '^$', op => 're' }; for (@tokens) { # convert regex-free configuration entries to regex for cleaner # configuration file format unless ( m,\$$, or m,^\^, ) { $_ = '^/' . $_ . '$'; } s, \* ,[^/]+/?,gx; push @wanted, { subject => 'subpath', value => $_, op => 're' }; } push @dirs, { parentdir => $dir, wanted => \@wanted }; } } return @dirs; } sub rundu { my $parentdir = shift; my $wanted = shift; # trailing slashes lead to // in output, so evict, but leave leading # slash (for / root) alone $parentdir =~ s, (? $parentdir } ); } open FROMDU, '-|' or exec @du_cmd, $targetdir or remark( 'error', 'du failed', { errno => $!, dir => $parentdir } ) and exit 102; my %dudefaults = ( %defaults, parentdir => $parentdir, epoch => time ); while () { chomp; # gather output into a hash for easy access of data my %duinfo = (%dudefaults); unless ( ( $duinfo{size}, $duinfo{path} ) = m/$du_re/ ) { # this should never be hit, unless output format of du changes remark( 'error', 'invalid output from du', { line => $., dir => $parentdir } ); exit 103; } # KLUGE fix reporting from du to be that of requested directory, not # symlink target if ( $targetdir ne $parentdir ) { $duinfo{path} =~ s/^$targetdir/$parentdir/; } # determine path excluding leading root $duinfo{subpath} = q{}; my $offset = index $duinfo{path}, $parentdir; if ( $offset > -1 ) { $duinfo{subpath} = substr $duinfo{path}, $offset + length $parentdir; } # KLUGE this assumes 'du -k' is being used! $duinfo{humansize} = humanize( $duinfo{size}, { base => 1024 } ) if $output_tmpl =~ m/%{humansize}/; # DBG to see what template values are available #use Data::Dumper; warn Dumper \%duinfo; for my $entry (@$wanted) { if ( $duinfo{ $entry->{subject} } =~ m/$entry->{value}/ ) { my $str_out = $output_tmpl; $str_out =~ s/ %{ (\w+) } / defined $duinfo{$1} ? $duinfo{$1} : q{} /egx; print $str_out; last; } } } close FROMDU or remark( 'warning', 'problem closing pipe from du', { errno => $!, dir => $parentdir } ); } # Inspired from GNU's df -h output, which fixes 133456345 bytes # to be something human readable. # # takes a number, returns formatted string. Optionally accepts # hash reference containing non-standard defaults. sub humanize { my $num = shift; my $prefs = shift; # error checking on input... return $num unless $num =~ m/^-?\d+$/; # various parameters that adjust how the humanization is done # these really should be able to be specified on the command line, or # read in from a prefs file somewhere, as nobody will agree as to what # "proper" human output should look like... :) my %defaults = ( # base numbers are in (1 is bytes, 1024 for K) 'base' => 1, # include decimals in output? (e.g. 25.8K vs. 26K) 'decimal' => 1, # include .0 in decmail output? 'decimal_zero' => 1, # what to divide file sizes down by # 1024 is generally "Kilobytes," while 1000 is # "kilobytes," technically 'factor' => 1024, # percentage above which will be bumped up # (e.g. 999 bytes -> 1 K as within 5% of 1024) # set to undef to turn off 'fudge' => 0.96, # lengths above which decimals will not be included # for better readability 'max_human_length' => 2, # list of suffixes for human readable output 'suffix' => [ 'B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' ], ); # merge passed options with defaults while ( my ( $k, $v ) = each %$prefs ) { if ( exists $defaults{$k} and $v ) { $defaults{$k} = $v; } } # some local working variables my $count = 0; my $prefix = q{}; my $tmp = q{}; # handle negatives if ( $num < 0 ) { $num = abs $num; $prefix = '-'; } # adjust number to proper base $num *= $defaults{'base'}; # reduce number to something readable by factor specified while ( $num > $defaults{'factor'} ) { $num /= $defaults{'factor'}; $count++; } # optionally fudge "near" values up to next higher level if ( $defaults{'fudge'} ) { if ( $num > ( $defaults{'fudge'} * $defaults{'factor'} ) ) { $count++; $num /= $defaults{'factor'}; } } # no .[1-9] decimal on longer numbers for easier reading # only show decimal if format say so if ( length sprintf( "%.f", $num ) > $defaults{'max_human_length'} || !$defaults{'decimal'} ) { $tmp = sprintf( "%.0f", $num ); } else { $tmp = sprintf( "%.1f", $num ); # optionally hack trailing .0 $tmp =~ s/\.0$// unless $defaults{'decimal_zero'}; } return $prefix . $tmp . $defaults{'suffix'}->[$count]; } sub remark { my $priority = shift; my $message = shift; my $attributes = shift; chomp $message; my $attr_str; if ($attributes) { $attr_str = join ', ', map { $attributes->{$_} ||= q{}; "$_=$attributes->{$_}" } sort keys %$attributes; } print STDERR "$priority: $message" . ( $attr_str ? ": $attr_str" : q{} ) . "\n"; return 1; } sub print_help { print <<"HELP"; Usage: $0 [options] [directory ..] [-] 'du' output parser and reporter Options: -h/-? Display this message. -c xx Load configuration data from stated file. -o oo Use custom output template. -d dd Specify additional default key=value pairs for output template. -x File system mount points are not traversed. Run perldoc(1) on this script for additional documentation. HELP exit 100; } 1; __END__ =head1 NAME diskusage - du output parser and reporter =head1 SYNOPSIS Run C to report on home directory usage: $ diskusage $HOME Load configuration, specify custom output template: $ diskusage -c ~/.diskusage/prefs -o '%{humansize} %{path}\n' Read disk usage information from a file. Good when C must be run on remote system that does not have or cannot run C: $ du -a -k . > usage $ env DU_CMD=cat diskusage usage =head1 DESCRIPTION =head2 Overview Wrapper for the Unix C command to provide selective matching of certain areas on disk and better output control via a template. =head2 Normal Usage $ diskusage [options] [directory ..] [-] See L<"OPTIONS"> for details on the command line switches supported. The C command will be run on each directory specified on the command line or from a configuration file, and results printed to standard out according to the output template. The default output format is similar to that of C, but with a human readable size following the path. This allows easy sorting by size. $ diskusage . | sort -n 12 ./CVS 12.0K 28 . 28.0K Directory specifications can either be loaded from the B<-c> configuration file option, and passed in on standard input if a hyphen is the last option on the command line. Otherwise, one or more directory names will need to be specified as arguments to the script. To change the C command that is run, set the C environment variable. Bear in mind this script expects sizes to be reported in kilobytes (e.g. with the C<-k> option to C) for the C output option. $ env DU_CMD='du -ak' diskusage . =head1 OPTIONS This script currently supports the following command line switches: =over 4 =item B<-h>, B<-?> Prints a brief usage note about the script. =item B<-c> I Load configuration data from the specified file. =item B<-o> I Specify a custom output template. Values to be expanded on should be written as C<%{name}>, and ideally a C<\n> included at the end of the template to print a newline between records. A newline will be added to the output template unless the template ends with whitespace. Currently available parameters for expansion include: size - to emulate regular du(1) output with path humansize - reports sizes in a human readable format epoch - when the du(1) command was run in epoch time parentdir - root directory for current du(1) command subpath - current directory under parentdir =item B<-d> I Specify additional defaults to be used in the output template. For instance, this allows the hostname to be added to the output. $ diskusage -c /etc/diskusage/prefs -d hostname=`hostname` \ -o '%{hostname}:%{path} %{size} %{epoch}\n' =item B<-x> File system mount points are not traversed. =back =head1 CONFIGURATION The configuration file currently consists of one or more lines that list a required directory on disk to scan, followed by optional subpath expressions to report on. Blank and commented lines will be ignored. Currently, the subpath expressions are either converted to regular expressions, or used directly as regular expressions. To specify a regular expression, ensure the string begins with C<^> or ends with C<$>. Otherwise, these will be added by the script, and any C<*> replaced with something that matches all directories under the current one. # scan home partition, report on all user directories therein /home * # report on various web areas /var/www * site/* logs/*/* # report size on directories that being with vowel to show a regular # expression example (to encourage use of the simple glob) /usr "^/[aeiou][^/]+/?$" By default, the total size used by each parent directory will be reported. =head1 BUGS =head2 Reporting Bugs Newer versions of this script may be available from: http://sial.org/code/perl/ If the bug is in the latest version, send a report to the author. Patches that fix problems or add new features are welcome. =head2 Known Issues No known issues. =head1 SEE ALSO du(1), perl(1) =head1 AUTHOR Jeremy Mates, http://sial.org/contact/ =head1 COPYRIGHT The author disclaims all copyrights and releases this script into the public domain. =head1 VERSION $Id: diskusage,v 1.16 2006/06/01 22:15:51 jmates Exp $ =cut