#!/usr/bin/perl -wT # # $Id: unseen,v 2.5 2005/09/05 20:29:58 jmates Exp $ # # The author disclaims all copyrights and releases this script into the # public domain. # # Lists new (unseen) files found under the specified directories. # # Run perldoc(1) on this script for additional documentation. use strict; use Digest::SHA1 (); my $ctx = Digest::SHA1->new; use File::Basename qw(basename fileparse); use File::Find qw(find); use File::Spec (); use Time::Local (); # TODO replace with DB File or SQL for efficiency? use YAML (); my $suffix_char = '.'; my $suffix_re = qr/(? $seen_file, errno => $@ } ); exit 105; } } # find date ranges for new files to skip (-T "today") or files older # than allowed by the -w "window" option my $midnight = Time::Local::timelocal( qw(0 0 0), (localtime)[ 3 .. 5 ] ); my $window = ( $midnight || $^T ) - duration2seconds( $opts{w} || '2w' ); undef $midnight unless $opts{T}; # throw out older entries from seen data for my $key ( keys %$seen ) { delete $seen->{$key} if $seen->{$key}->{mtime} < $window; } # read in tags to use my @tags = qw(default); if ( exists $opts{t} ) { @tags = $opts{t} =~ m/ ([\w.-]+) /gx; } # and how to build the index (default: only off digest) my %index = qw(digest 1); if ( exists $opts{i} ) { %index = (); my @keys = $opts{i} =~ m/ ([\w.-]+) /gx; @index{@keys} = (1) x @keys; # if tags, add to index automagically $index{tag} = 1 if exists $opts{t}; } # output template, with expansion of '\n' to literal char my $output_tmpl = '%{dirname}/%{filename}\n'; $output_tmpl = '%{tag} %{dirname}/%{filename}\n' if exists $opts{t}; if ( exists $opts{o} ) { $output_tmpl = $opts{o}; } $output_tmpl =~ s/(\\.)/qq!"$1"!/eeg; # look for files my $parent; for (@dirs) { $parent = $_; find { wanted => \&process, no_chdir => 1 }, $parent; } # update seen index # TODO replace with DB File or SQL? 
eval { YAML::DumpFile( $seen_file, $seen ); };
if ($@) {
    my $save_error = $@;
    chomp $save_error;
    remark( 'error', 'could not save index',
        { file => $seen_file, errno => $save_error } );
    exit 106;
}

exit;

# File::Find "wanted" handler, called once per entry found under the
# current $parent directory. The find() call sets no_chdir, so $_ holds
# the path as File::Find built it rather than a bare file name.
#
# Reads the file-level settings %opts, $parent, $window, $midnight,
# %index, @tags, $output_tmpl and updates the $seen index in place.
# Prints one line per unseen file/tag combination. Returns nothing
# useful; exits 103 if a path fails the untaint check.
sub process {
    my @stat = stat;

    # -l option: prune every directory except the starting one, so
    # only files directly inside $parent are considered
    if ( exists $opts{l} and -d _ and $_ ne $parent ) {
        $File::Find::prune = 1;
        return;
    }

    # TODO support for directories, so can spot new/removed dirs, or is
    # that getting outside the domain of this script too much?
    return unless -f _;

    my %filedata;

    # skip logfiles outside of processing window by modification time
    $filedata{mtime} = $stat[9];
    return if $filedata{mtime} < $window;

    # skip files modified today if needed
    if ($midnight) {
        return if $filedata{mtime} > $midnight;
    }

    # untaint filename (paranoia)
    my $filename = untaintpath($_);
    unless ( defined $filename ) {
        remark( 'error', 'could not untaint filename', { file => $_ } );
        exit 103;
    }

    # calculate various data from filename
    %filedata = ( %filedata, %{ filedata($filename) } );

    if ( exists $index{digest} ) {
        $filedata{digest} = filedigest($filename);
        unless ( defined $filedata{digest} ) {
            remark( 'notice', 'skipping as no checksum',
                { file => $filename } );
            return;
        }
    }

    # filename duplicate suppression on output; any duplicates in output
    # will be due to different files having the same output such as when
    # the output template excludes the file filename or digest information
    my %printed;

    for my $tag (@tags) {
        $filedata{tag} = $tag;

        # Only undefined fields become empty strings. Testing truth
        # instead would turn a "0" value (file named "0", suffix "0" of
        # a rotated log) into an empty string and let distinct files
        # share one index key.
        $filedata{index} = join '.',
          map { defined $filedata{$_} ? $filedata{$_} : q{} }
          sort keys %index;

        next if exists $seen->{ $filedata{index} };

        # Expand %{field} placeholders. The braces are escaped because
        # perl 5.26 and later reject an unescaped literal "{" here.
        ( my $output = $output_tmpl ) =~
          s/ %\{ (\w+) \} / defined $filedata{$1} ? $filedata{$1} : q{} /egx;

        print $output unless exists $printed{$output};
        $printed{$output} = 1;

        # TODO replace with DB File or SQL for efficiency?
        #
        # need mtime stored in index, as used to retire old records
        $seen->{ $filedata{index} } = {
            map { exists $filedata{$_} ? ( $_ => $filedata{$_} ) : () }
              qw(filepath mtime)
        };
    }
}

# untaints data, restricts what filenames may contain.
# Accepts 1 or more file paths, returns undef or 1 or more paths
# depending on the results of the tests: in list context every path that
# passed, in scalar context the first path that passed (undef if none).
sub untaintpath {
    my @paths;

    for my $candidate (@_) {
        next unless defined $candidate;

        # limit to allowed characters; \A and \z anchor the whole string
        # so a trailing newline cannot slip through, and the capture is
        # only read after a successful match
        next unless $candidate =~ m{\A([A-Za-z0-9_./-]+)\z};
        my $clean = $1;

        # skip names with ../.. runs
        next if index( $clean, '../..' ) >= 0;

        push @paths, $clean;
    }

    return wantarray ? @paths : $paths[0];
}

# takes filename, returns sha1 digest of file contents using
# global object. Returns nothing (after logging a warning) if the file
# cannot be opened or read.
sub filedigest {
    my $file = shift;

    # three-argument open with a lexical handle: the name is never
    # parsed for a mode and the handle cannot clash with other code
    my $fh;
    unless ( open $fh, '<', $file ) {
        remark( 'warning', 'could not open file',
            { file => $file, errno => $! } );
        return;
    }
    binmode $fh;

    # b64digest method resets object, so can reuse. addfile croaks on
    # a read error; trap that so one bad file does not abort the run.
    my $digest = eval { $ctx->addfile($fh)->b64digest };
    my $read_error = $@;
    close $fh;

    unless ( defined $digest ) {
        chomp $read_error;

        # a failed addfile may leave partial data in the shared object
        $ctx->reset;
        remark( 'warning', 'could not read file',
            { file => $file, errno => $read_error } );
        return;
    }

    return $digest;
}

# parses out various file information such as the dirname, filename,
# file name without suffix, suffix. Used by rules to figure out what to
# do with a particular file. Takes a path, returns a hash reference with
# the keys filename, dirname, file, filepath and (when the name splits
# into more than one portion) suffix.
sub filedata {
    my $filename = shift;
    my %filedata;

    ( $filedata{filename}, $filedata{dirname}, undef ) =
      fileparse($filename);

    $filedata{dirname} = File::Spec->rel2abs( $filedata{dirname} );

    # KLUGE strip trailing / from path to avoid // in paths
    $filedata{dirname} =~ s,/+$,,;

    # first portion is the bare name, anything after it is rejoined
    # with the suffix character to form the suffix
    my @portions = split /$suffix_re/, $filedata{filename};
    $filedata{file} = $portions[0];
    if ( @portions > 1 ) {
        $filedata{suffix} = join $suffix_char, @portions[ 1 .. $#portions ];
    }

    $filedata{filepath} =
      File::Spec->catfile( $filedata{dirname}, $filedata{filename} );

    return \%filedata;
}

# for logging things. Takes a priority label, a message and an optional
# hash reference of attributes; prints "priority: message: k=v, k=v" to
# standard error with attributes sorted by key. Always returns 1.
sub remark {
    my $priority   = shift;
    my $message    = shift;
    my $attributes = shift || {};

    chomp $message;

    my $attr_str = q{};
    if ( keys %$attributes ) {

        # Work on copies so the caller's hash is left untouched, and
        # only blank out undefined values so that "0" is still shown.
        $attr_str = join ', ', map {
            my $value = $attributes->{$_};
            $value = q{} unless defined $value;
            "$_=$value"
        } sort keys %$attributes;

        # escape wacky characters to avoid messing up things
        $attr_str =~ s/ ( [^[:print:]\t] ) / sprintf "\\%03o", ord $1 /egx;
    }

    print STDERR "$priority: $message"
      . ( length $attr_str ? ": $attr_str" : q{} ) . "\n";

    return 1;
}

# takes duration such as "2m3s" and returns number of seconds.
sub duration2seconds {

    # Takes one duration string: either a plain number, which is
    # already a count of seconds, or number/unit pairs such as "2m 5s"
    # using the units y, w, d, h, m and s. Returns the total number of
    # seconds. Logs an error and exits 104 if the string contains other
    # characters or yields no usable total.
    my $tmpdur = shift;
    my $seconds;

    # how to convert short human durations into seconds
    my %factor = (
        y => 31536000,
        w => 604800,
        d => 86400,
        h => 3600,
        m => 60,
        s => 1,
    );

    # assume raw seconds for plain number (no unit means no scaling)
    if ( $tmpdur =~ m/^\d+$/ ) {
        $seconds = $tmpdur + 0;
    }
    elsif ( $tmpdur =~ m/^[ywdhms\d\s]+$/ ) {

        # match "2m 5s" style input and convert to seconds
        while ( $tmpdur =~ m/(\d+)\s*([ywdhms])/g ) {
            $seconds += $1 * $factor{$2};
        }
    }
    else {
        remark( 'error', 'unknown characters in duration',
            { duration => $tmpdur } );
        exit 104;
    }

    # catches input such as "ms" that passed the character check but
    # contained no number/unit pair
    unless ( defined $seconds and $seconds =~ m/^\d+$/ ) {
        remark( 'error', 'unable to parse duration',
            { duration => $tmpdur } );
        exit 104;
    }

    return $seconds;
}

# clean up env for taint mode ("perldoc perlsec" for more information)
sub BEGIN {
    delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
    $ENV{PATH} = '/bin:/usr/bin';
}

# a generic help blarb: prints the usage text to standard output and
# exits with status 100
sub print_help {
    print <<"HELP";
Usage: $basename [options] -s seen-index directory [directory ...]

Lists new (unseen) files under specified directory(-ies).

Options supported:

  -h/-?  Display this message.

  -s ss  Path to list of previously seen filenames index.
  -w dd  Window inside which files will be considered against index.
  -T     Skip files modified today.

  -t tt  Custom tags to optionally add to index calculation.
  -i ii  File data to base index on.
  -o oo  Custom output template.

  -l     Do not recurse into sub directories.

Run perldoc(1) on this script for additional documentation.

HELP
    exit 100;
}

__END__

=head1 NAME

unseen - lists unseen files under specified directories

=head1 SYNOPSIS

List unseen files under /var/log/archive new within the last three
weeks, using /var/log/archive.seen as the seen archive file.

  $ unseen -w 3w -s /var/log/archive.seen /var/log/archive

=head1 DESCRIPTION

=head2 Overview

For listing unseen files under directory trees.
Written to handle case of finding unprocessed logfiles (limited numbers
of files) under log directories for passing to log processing utilities
such as swatch or logwatch.

Files are recorded as seen with aid of a seen file, which is indexed by
default using the SHA1 digest of the file contents. A processing window
option configures how far back in time the script is allowed to report
on, to keep the index file and SHA1 digest generation needs from growing
over time.

As a consequence of the default SHA1 digest index, the script will list
both files that are new to the search directory and ones that have been
modified since the previous run of the script. However, a list of file
data terms to index off of may be specified, such as the filename or the
file modification time, in the event that SHA1 digests are not needed or
are too slow for the application in question.

This script currently will not be efficient for large numbers of files,
due to use of SHA1 digest and storing the index in memory and on file
with YAML.

=head2 Normal Usage

  $ unseen [options] -s seen-index directory [directory ...]

See L<"OPTIONS"> for details on the command line switches supported.

Either a single or multiple search directories can be specified;
directories will be searched recursively for regular files.

Standard output will contain a list of unseen files, in a format based
on the output template in use. Output will not be duplicated on a
per-file basis.

Errors will be printed to standard error; the script will exit with a
non-zero exit code if something is wrong outside of the file indexing
loop, and will issue warnings and attempt to continue running if there
are problems inside the file indexing loop.

=head1 OPTIONS

This script currently supports the following command line switches:

=over 4

=item B<-h>, B<-?>

Prints a brief usage note about the script.

=item B<-s> I<seen-index>

Path to load seen file data from, or save to after listing the unseen
files for the run in question.

=item B<-w> I<duration>

Prevents files older than midnight minus I<duration> from being
considered with SHA1 checksum comparisons against the seen file index.
The duration can either be in raw seconds or a short-hand "2m5s" format.
The default is to skip files older than two weeks.

Should the script fail to calculate midnight, the script launch time
will be used.

=item B<-T>

Skip files modified after midnight (00:00) of current day. Good for
excluding active logfiles, assuming daily rotation.

=item B<-t> I<tags>

Specify custom tags. Use these to support different log analysis
programs, such that each program has a different tag. This way, a new
analysis program can be added by specifying a new tag, resulting in
analysis of prior unseen files by the new program. If new programs need
not be run over the past files, then do not use the tag option.

Tags are currently word characters, plus hyphens and periods. They may
be delimited with any other character, such as commas or spaces, though
need to be specified as a single unit as far as the shell is concerned,
if relevant.

  -t swatch-mail,swatch-messages,logwatch

=item B<-i> I<fields>

Specify what fields should be used to construct the unique key for each
file. The default index is C<digest>, which is the SHA1 digest
algorithm, along with the tag name if the B<-t> option is being used.

For speed, and where the filenames are static, use the filename plus the
file modification time.

  -i dirname,filename,mtime

Using the B<-t> tag option will always add the C<tag> field to the list.

Currently available fields include:

  digest    SHA1 digest algorithm
  dirname   Directory containing file
  file      Filename without suffix, if any
  filename  Full filename
  mtime     File modification time
  suffix    Filename suffix, if any
  tag       Added automatically by -t option

=item B<-o> I<template>

Specify a custom template for the display of what files have not been
seen. By default, the output template is C<%{dirname}/%{filename}\n>, or
C<%{tag} %{dirname}/%{filename}\n> with the B<-t> tag option set.
As an example, an output template that separates the directory name, and
includes the file digest and modification time:

  -o '%{tag} %{dirname} %{filename} %{digest} %{mtime}\n'

The template uses the same fields that are available to the B<-i> index
fields option. Backslashed characters such as C<\n> will be converted to
their literal equivalent, such as a newline. Some form of a newline or
other delimiter at the end of the custom template is highly advised.

=item B<-l>

Do not recurse into subdirectories below the parent directory.

=back

=head1 SECURITY

Taint mode is enabled by default. The script will die if the
I<seen-index> or supplied directories fail an untaint check.

=head1 BUGS

=head2 Reporting Bugs

Newer versions of this script may be available from:

http://sial.org/code/perl/

If the bug is in the latest version, send a report to the author.
Patches that fix problems or add new features are welcome.

=head2 Known Issues

Index file is in YAML, which is loaded into memory. Keeping track of
large numbers of files will thus be inefficient. One possible solution
would be to move the index to use a database such as SQLite or DB File.

Search for C<TODO> statements in the source for areas known to need
work.

=head1 SEE ALSO

perl(1), Digest::SHA1, YAML

=head1 AUTHOR

Jeremy Mates, http://sial.org/contact/

=head1 COPYRIGHT

The author disclaims all copyrights and releases this script into the
public domain.

=head1 VERSION

  $Id: unseen,v 2.5 2005/09/05 20:29:58 jmates Exp $

=head1 SCRIPT CATEGORIES

Utilities

UNIX/System_administration

=cut