view archivecheck.pl @ 20:79c6618e8dfa

small fixes
author casties
date Tue, 20 Sep 2005 19:30:34 +0200
parents 320c4b93bf39
children 24d9dd63ae93
line wrap: on
line source

#!/usr/local/bin/perl -w

use strict;

use XML::LibXML;

# MPIWG libraries
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.4.1 (15.2.2005)";

# read command line parameters
my $args = parseargs;

# debug level
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# archive name (archive-path element, usually == $docdir)
my $archname;
# archive storage date
my $archdate;


#######################################################
# external programs
#
my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc";
if (! -x $archprog) {
    logger("ABORT", "TSM client program '$archprog' missing!!");
    exit 1;
}
# my $checkprog = "/usr/local/mpiwg/archive/metacheck";
# if (! -x $checkprog) {
#     logge("ABORT", "meta data checking program '$checkprog' missing!!");
#     exit 1;
# }
# log file for archiver
my $log_file = "/var/tmp/archivecheck.log";
if (! open LOG, ">>$log_file") {
    logger("ABORT", "unable to write log file '$log_file'!!");
    exit 1;
}

#######################################################
# check parameters that were passed to the program
#
my $docdir = $$args{'path'};
if (! $docdir) {
    print "ABORT: no document directory given!\n";
    exit 1;
}
# strip trailing slashes
$docdir =~ s/\/$//;
if (! -d $docdir) {
    print "ABORT: document directory \'$docdir\' doesn't exist!\n";
    exit 1;
}

my $metafile = "$docdir/index.meta";
if (! -f $metafile) {
    print "ABORT: metadata index file \'$metafile\' doesn't exist!\n";
    exit 1;
}

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

#######################################################
# subroutines
#


#
# $files = read_resource_meta($rootnode)
#
# checks general resource meta information and reads the list of files
#
sub read_resource_meta {
    my ($rootnode) = @_;
    my %files;
    #
    # archive path
    #
    # get archive-path
    $archname = sstrip($rootnode->findvalue('child::archive-path'));
    if (! $archname) {
	logger("ABORT", "archive-name element missing!!");
	exit 1;
    }

    #
    # files
    #
    my @filenodes = $rootnode->findnodes('child::file');
    foreach my $fn (@filenodes) {
	my $name = sstrip($fn->findvalue('child::name'));
	my $path = sstrip($fn->findvalue('child::path'));
	logger("DEBUG", "FILE: ($path)$name");
	my $f = ($path) ? "$path/$name" : "$name";
	$files{$f} = [$name];
    }

    #
    # dirs
    #
    my @dirnodes = $rootnode->findnodes('child::dir');
    foreach my $fn (@dirnodes) {
	my $name = sstrip($fn->findvalue('child::name'));
	my $path = sstrip($fn->findvalue('child::path'));
	logger("DEBUG", "DIR: ($path)$name");
	my $f = ($path) ? "$path/$name" : "$name";
	$files{$f} = [$name];
    }

    #
    # archive-storage-date
    #
    my $archdate = $rootnode->find('child::archive-storage-date');
    if ($archdate) {
	logger("INFO", "archive storage date: $archdate");
    } else {
	logger("ERROR", "archive storage date missing!");
	$errcnt++;
    }
    return \%files;
}


#
# fs_read_files($realdir, $docdir, \%files, \%dirs)
#
# reads all files and directories below $realdir and puts the
# files in %files and directories in %dirs
# $docdir is only for recursion, it should be empty when called 
# from outside
#
sub fs_read_files {
    my ($directory, $docdir, $files, $dirs) = @_;    
    my $cnt = 0;

    if (! opendir DIR, $directory) {
	return 0;
    }
    my @dirfiles = readdir DIR;
    foreach my $fn (@dirfiles) {
	# ignore names starting with a dot
	next if ($fn =~ /^\./);
	# ignore other silly files
	next if ($junk_files{$fn});

	$cnt++;
	my $f = "$directory/$fn";
	my $docf = ($docdir) ? "$docdir/$fn" : $fn;
	#print "fs_file: \"$f\"\n";
	if (-f $f) {
	    #print "  is file\n";
	    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
                      $atime,$mtime,$ctime,$blksize,$blocks)
                          = stat(_); 
	    $$files{$docf} = [$fn, $size, stime($mtime)];
	    #logger("TEST", "fn $fn, size $size, mtime $mtime");
	} elsif (-d _) {
	    #print "  is dir\n";
	    $$dirs{$docf} = $fn;
	    # recurse into directory
	    $cnt += fs_read_files($f, $docf, $files, $dirs);
	}
    }
    return $cnt;
}


#
# $archcnt = run_query($dirquery, \%files)
#
# runs the archiver program on $dirquery and adds to the hash of archived files
#
# Sample output:
#         20,345  B  08/06/03   17:17:02    /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839/index.meta Never /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839
#
sub run_query {
    my ($dirquery, $files) = @_;
    print LOG "START checkarchive $version ", scalar localtime, "\n";
    my $archcmd = $archprog;
    $archcmd .= " query archive -subdir=yes";
    $archcmd .= " -description='$archname'";
    $archcmd .= " '$dirquery'";

    my $archcnt = 0;
    print LOG "CMD: $archcmd\n";
    if (open ARCH, "$archcmd 2>&1 |") {
	while (<ARCH>) {
	    chomp;
	    print LOG "ARCH: $_\n";
	    if (/
		\s*([\d,]+)    # size
		\s+(\w+)       # unit of size
		\s+([\d\/]+)   # date mm\/dd\/yy
		\s+([\d:]+)    # time
		\s+(\S+)       # file name
		\s+(\w+)       # expiry
		\s+(\S+)       # archive label
		/x) {
		my $size = $1;
		my $sunit = $2;
		my $date = $3;
		my $time = $4;
		my $file = $5;
		my $exp = $6;
		my $label = $7;
		$size =~ s/,//g;
		$date = ymd_date($date);
		logger("DEBUG", "  QUERY: file '$file'");
		$archcnt++;
		if ($$files{$file}) {
		    logger("DEBUG", "file $file seems to be archived multiple times: $time $date");
		    #$warncnt++;
		}
		if (length $file <= length $docdir) {
		    logger("DEBUG", "not below document dir: $file");
		    next;
		}
		$$files{$file} = [$size, "$date $time"];
	    }
	}
    } else {
	logger("ABORT", "unable to start archive command '$archcmd'!!");
	exit 1;
    }

    return $archcnt;
}


#
# check_files(\%files_to_archive, \%archived_files)
#
# compares the list of archived and to be archived files
#
sub check_files {
    my ($to_archive, $archived) = @_;

    my $nt = scalar keys %$to_archive;
    my $na = scalar keys %$archived;

    foreach my $ft (sort keys %$to_archive) {
	my $fp = "$docdir/$ft";
	#logger("DEBUG", "  fp: $fp");
	if ($$archived{$fp}) {
	    logger("DEBUG", "$ft archived OK");
	    $$archived{$fp}->[2] = "OK";
	} else {
	    logger("ERROR", "file entry '$ft' missing from archive!");
	    $errcnt++;
	}
    }

    foreach my $fa (sort keys %$archived) {
	if (! $$archived{$fa}->[2]) {
	    my ($fn, $fp) = split_file_path($fa);
	    if ($index_files{$fn}) {
		logger("DEBUG", "$fa ignored");
		$na--;
	    } else {
		logger("WARNING", "$fa archived but not in list!");
		$warncnt++;
	    }
	}
    }

    if ($nt > $na) {
	logger("WARNING", "less files were archived ($na vs. $nt)!");
	$warncnt++;
    } elsif ($na > $nt) {
	logger("WARNING", "more files were archived ($na vs. $nt)!");
	$warncnt++;
    }

}

#
# compare_files(\%files_on_disk, \%archived_files)
#
# compares the list of archived files and files on disk
#
sub compare_files {
    my ($fs_files, $archived) = @_;

    foreach my $ft (sort keys %$fs_files) {
	next if ($index_files{$ft});
	my $fp = "$docdir/$ft";
	#logger("DEBUG", "  fp: $fp");
	if ($$archived{$fp}) {
	    next if ($index_files{$ft});
	    
	    my $asize = $$archived{$fp}[0];
	    my $atime = $$archived{$fp}[1];
	    my $fsize = $$fs_files{$ft}[1];
	    my $ftime = $$fs_files{$ft}[2];
	    if ($asize != $fsize) {
		logger("ERROR", "archived $ft ($asize) and file on disk ($fsize) have different size!");
		$errcnt++;
	    } elsif ($atime lt $ftime) {
		logger("ERROR", "archived $ft ($atime) is older than file on disk ($ftime)!");
		$errcnt++;
	    } else {
		logger("WARNING", "archived file $ft still on disk");
		$warncnt++;
	    }
	} else {
	    logger("ERROR", "file on disk '$ft' is not in archive!");
	    $errcnt++;
	}
    }
}



#######################################################
# main
#

logger("INFO", "archivecheck $version");

# make shure the right user is running this program
my $user = getlogin;
if (not (($user eq "archive")||($user eq "root"))) {
    logger("ABORT", "you must be archive or root user to run this program!");
    exit 1;
}

# read index.meta file
my ($document, $rootnode) = read_xml($metafile);

# check file and add archive date
my $files_to_archive = read_resource_meta($rootnode);

# check for .archived file
if (-f "$docdir/.archived") {
    logger("INFO", ".archived file exists.");
} else {
    logger("WARNING", "no .archived file!");
    $warncnt++;
}

# check archive
my %archived_files = ();
my $archcnt = 0;
if ($docdir =~ /\/mpiwg\/archive\/data\/(.*)/) {
    # TSM needs two different paths because of historical mount points :-(
    my $docdir1 = "/mpiwg/archive/data/";
    $archcnt += run_query($docdir1, \%archived_files);
    my $docdir2 = "/mpiwg/archive/";
    $archcnt += run_query($docdir2, \%archived_files);
} else {
    $archcnt += run_query("$docdir/", \%archived_files);
}
logger("INFO", "$archcnt archives of " . (scalar keys %archived_files) . " files.");

my $num_arch_files = (scalar keys %archived_files);
if ($num_arch_files == 0) {
    logger("ABORT", "no archive of this directory!!");
    exit 1;
}
logger("INFO", "$num_arch_files files archived");

# check list of archived files
check_files($files_to_archive, \%archived_files);

# read files from filesystem
my %fsfiles;
my %fsdirs;
my $num_fs_files = fs_read_files($docdir, "", \%fsfiles, \%fsdirs);

logger("INFO", "$num_fs_files files still on disk!");
if ($num_fs_files > 0) {
    compare_files(\%fsfiles, \%archived_files);
}

logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt == 0) {
    logger("DONE", "" . (scalar keys %archived_files) . " archived files OK");
    exit 0;
} else {
    logger("ABORT", "there were $errcnt errors!!");
    exit 1;
}