#!/usr/bin/perl -w use strict; use XML::LibXML; use FileHandle; # MPIWG libraries use lib '/usr/local/mpiwg/archive'; use MPIWGStor; # make output unbuffered $|=1; ####################################################### # internal parameters # # program version my $version = "0.3.4 (11.8.2009)"; # read command line parameters my $args = parseargs; # debug level $debug = (exists $$args{'debug'}) ? ($$args{'debug'}) : 0; # rewrite XML file (necessary for archive date!) my $fix_xml = 1; my $xml_changed = 0; # XML namespace (not really implemented!) my $namespace = ""; # archive name (archive-path element, usually == $docdir) my $archname; # archive storage date my $archdate; ####################################################### # external programs # my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc"; if (! -x $archprog) { logger("ABORT", "TSM client program '$archprog' missing!!"); exit 1; } my $checkprog = "/usr/local/mpiwg/archive/archivecheck"; if (! -x $checkprog) { logger("ABORT", "archive checking program '$checkprog' missing!!"); exit 1; } # log file for archiver my $log_file = "/var/tmp/unarchiver.log"; if (! open LOG, ">>$log_file") { logger("ABORT", "unable to write log file '$log_file'!!"); exit 1; } LOG->autoflush(1); ####################################################### # check parameters that were passed to the program # my $docdir = $$args{'path'}; if (! $docdir) { print "ABORT: no document directory given!\n"; exit 1; } # strip trailing slashes $docdir =~ s/\/$//; if (! -d $docdir) { print "ABORT: document directory \'$docdir\' doesn't exist!\n"; exit 1; } my $metafile = "$docdir/index.meta"; if (! -f $metafile) { print "ABORT: metadata index file \'$metafile\' doesn't exist!\n"; exit 1; } ####################################################### # internal variables # # number of errors my $errcnt = 0; # number of warnings my $warncnt = 0; ####################################################### # subroutines # # construct document's parent dir sub get_parent { my ($dirname) = @_; my $dirparent = $dirname; $dirparent =~ s!/[^/]+$!!; return $dirparent; } # # $files = read_resource_meta($rootnode) # # checks general resource meta information and reads the list of files # sub read_resource_meta { my ($rootnode) = @_; my %files; # # archive path # # get archive-path $archname = sstrip($rootnode->findvalue('child::archive-path')); if (! $archname) { logger("ABORT", "archive-name element missing!!"); exit 1; } # # files # my @filenodes = $rootnode->findnodes('child::file'); foreach my $fn (@filenodes) { my $name = sstrip($fn->findvalue('child::name')); my $path = sstrip($fn->findvalue('child::path')); logger("DEBUG", "FILE: ($path)$name"); my $f = ($path) ? "$path/$name" : "$name"; $files{$f} = [$name]; } # # dirs # my @dirnodes = $rootnode->findnodes('child::dir'); foreach my $fn (@dirnodes) { my $name = sstrip($fn->findvalue('child::name')); my $path = sstrip($fn->findvalue('child::path')); logger("DEBUG", "DIR: ($path)$name"); my $f = ($path) ? "$path/$name" : "$name"; $files{$f} = [$name]; } # # archive-storage-date # my $archdate = $rootnode->find('child::archive-storage-date'); if ($archdate) { logger("INFO", "archive storage date: $archdate"); } else { logger("ERROR", "archive storage date missing!"); $errcnt++; } # # archive-recall-date # my $recalldatenode = ($rootnode->find('child::archive-recall-date'))->get_node(1); if ($recalldatenode) { print "INFO: archive recall date exists!\n"; # delete old date $recalldatenode->removeChildNodes; } else { # create new storage date node $recalldatenode = $rootnode->addNewChild($namespace, "archive-recall-date"); # move after archive-path $rootnode->insertAfter($recalldatenode, ($rootnode->find('child::archive-storage-date'))->get_node(1)); } $recalldatenode->appendTextNode(scalar localtime); $xml_changed++; return \%files; } # # $num_files = run_retrieve($docdir, $docmount, \%files) # # Runs the retriever program on $docdir and returns the number of unarchived files. # All filenames are put in %files. # $docmount is the mount point of the doc partition in cases when the new mount point # is different. # # Sample output: # (old!) Retrieving 17,234 /mpiwg/archive/data/test/auto_titit_123/pageimg/essen-wind1.jpg [Done] # Retrieving 42,406,326 /mpiwg/archive/data/library/B980G582/raw/00015.tif --> /mpiwg/archive/data/library/B980G582/raw/00015.tif [Done] sub run_retrieve { my ($archdir, $archmount, $files) = @_; my $archparent; if ($archmount eq $archdir) { # no explicit mount point $archparent = get_parent($archdir); } else { # destination dir is mount point $archparent = $archmount; } logger("INFO", "looking for archives in $archmount..."); print LOG "START unarchive $version on ", scalar localtime, "\n"; my $archcmd = $archprog; $archcmd .= " retrieve -subdir=yes -replace=all -ifnewer"; $archcmd .= " -description='$archname'"; # archive name $archcmd .= " '$archmount/'"; # archive mount point $archcmd .= " '$archparent/'"; # destination dir name logger('INFO', "querying TSM server for $archmount, please wait..."); my $archcnt = 0; my $numfiles = 0; print LOG "CMD: $archcmd\n"; if (open ARCH, "$archcmd 2>&1 |") { while () { chomp; print LOG "ARCH: $_\n"; if (/ Retrieving \s+([\d,]+) # size \s+(\S+) # file name \s+--> \s+(\S+) # destination file name \s+\[Done\] /x) { my $size = $1; my $file = $2; $size =~ s/,//g; logger("DEBUG", " RETRIEVE: file '$file'"); $archcnt++; if ($$files{$file}) { logger("WARNING", "file $file seems to be archived multiple times."); $warncnt++; } $$files{$file} = [$size]; } } $numfiles = (scalar keys %$files); logger("INFO", "$archcnt archives of $numfiles files (in $archmount)."); } else { logger("ABORT", "unable to start archive command '$archcmd'!!"); exit 1; } return $numfiles; } # # check_files(\%files_to_retrieve, \%retrieved_files) # # compares the list of archived and retrieved files # sub check_files { my ($to_retrieve, $retrieved) = @_; my $nt = scalar keys %$to_retrieve; my $na = scalar keys %$retrieved; foreach my $ft (sort keys %$to_retrieve) { my $fp = "$docdir/$ft"; #logger("DEBUG", " fp: $fp"); if ($$retrieved{$fp}) { logger("DEBUG", "$ft retrieved OK"); $$retrieved{$fp}->[1] = "OK"; } else { logger("ERROR", "file entry '$ft' missing from archive!"); $errcnt++; } } foreach my $fa (sort keys %$retrieved) { if (! $$retrieved{$fa}->[1]) { my ($fn, $fp) = split_file_path($fa); if ($index_files{$fn}) { logger("DEBUG", "$fa ignored"); $na--; } else { logger("WARNING", "$fa retrieved but not in list!"); $warncnt++; } } } if ($nt > $na) { logger("WARNING", "less files were retrieved ($na vs. $nt)!"); $warncnt++; } elsif ($na > $nt) { logger("WARNING", "more files were retrieved ($na vs. $nt)!"); $warncnt++; } } ####################################################### # main # logger("INFO", "unarchiver $version"); # make shure the right user is running this program my $user = getlogin || getpwuid($<); if (($user)&&($user ne "archive")&&($user ne "root")) { logger("ABORT", "you must be archive or root user to run this program!"); exit 1; } # use checkarchive first if (system("$checkprog $docdir >/dev/null") == 0) { logger("INFO", "archive \"$docdir\" check OK"); } else { logger("ABORT", "archive \"$docdir\" check failed!!"); exit 1; } # read index.meta file my ($document, $rootnode) = read_xml($metafile); # check index file my $archived_files = read_resource_meta($rootnode); my $num_archived_files = scalar keys %$archived_files; # check for .archived file if (-f "$docdir/.archived") { logger("INFO", ".archived file exists."); } else { logger("WARNING", "no .archived file!"); $warncnt++; } logger("INFO", "$num_archived_files files to retrieve."); # save current index.meta park_file($metafile); # retrieve my %retrieved_files = (); my $archcnt = 0; if ($docdir =~ /\/mpiwg\/archive\/data\/(.*)/) { # TSM needs two different paths because of historical mount points :-( # try the new one first $archcnt = run_retrieve($docdir, "/mpiwg/archive", \%retrieved_files); if ($archcnt == 0) { # and then the old one $archcnt = run_retrieve($docdir, "/mpiwg/archive/data", \%retrieved_files); } } else { # otherwise we assume that it works $archcnt += run_retrieve($docdir, $docdir, \%retrieved_files); } # restore current index.meta unpark_file($metafile); if ($archcnt == 0) { logger("ABORT", "no files retrieved!!"); exit 1; } logger("INFO", "$archcnt files retrieved"); # check list of archived files check_files($archived_files, \%retrieved_files); # rewrite index.meta file write_xml($document, $metafile); logger("INFO", "$warncnt warnings"); logger("INFO", "$errcnt errors"); if ($errcnt == 0) { logger("DONE", "$archcnt archived files retrieved"); } else { logger("ABORT", "there were $errcnt errors!!"); exit 1; }