foxridge-archiver: changeset 0:30497c6a3eca
Initial revision
author   | casties
date     | Thu, 17 Jun 2004 17:58:42 +0200
parents  |
children | 9143a0c4c678 ea3972762520
files    | HarvestmetaHandler.pm MPIWGStor.pm archivecheck.pl archiver.pl harvestmeta.pl metacheck.pl unarchiver.pl
diffstat | 7 files changed, 2197 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/HarvestmetaHandler.pm	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,100 @@
+#
+# SAX handler for harvestmeta
+#
+
+package HarvestmetaHandler;
+
+use strict;
+
+use base qw(XML::SAX::Base);
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+my @currElemPath;
+my $currElem;
+my $currText;
+my $currAttrib;
+my @elements;
+
+sub getData {
+    return @elements;
+}
+
+sub start_document {
+    my ($self, $doc) = @_;
+    # process document start event
+    #logger('DEBUG', "startdoc: $self, $doc");
+    @currElemPath = ();
+    $currElem = "";
+    $currText = "";
+    $currAttrib ="";
+    @elements = ();
+}
+
+sub start_element {
+    my ($self, $el) = @_;
+    # process element start event
+    #logger('DEBUG', "startelem: $self, $el");
+    # check if the last element needs to be finished
+    if ($currElem) {
+        my $elem = join "/", @currElemPath;
+        push @elements, [$elem, "", $currAttrib];
+    }
+    # element name is either LocalName or Name
+    my $name = $$el{'LocalName'};
+    $name = $$el{'Name'} unless ($name);
+    #logger('DEBUG', "  name: $name");
+    # assemble attributes string
+    $currAttrib ="";
+    foreach $a (values %{$$el{'Attributes'}}) {
+        my $n = $$a{'LocalName'};
+        $n = $$a{'Name'} unless ($n);
+        my $v = $$a{'Value'};
+        $currAttrib .= "$n=\"$v\" ";
+    }
+    # start element name
+    push @currElemPath, $name;
+    $currElem = $name;
+    $currText = "";
+}
+
+sub end_element {
+    my ($self, $el) = @_;
+    # process element end event
+    #logger('DEBUG', "endelem: $self, $el");
+    # check element name
+    my $name = $$el{'LocalName'};
+    $name = $$el{'Name'} unless ($name);
+    my $lastag = $currElemPath[$#currElemPath];
+    if ($lastag ne $name) {
+        logger('ERROR', "closing tag '$lastag' doesn't match '$name'!");
+    }
+    # assemble element path
+    my $elem = join "/", @currElemPath;
+    # strip whitespace from element content
+    $currText =~ s/^\s*//;
+    $currText =~ s/\s*$//;
+    if ($currText) {
+        # put pair in elements array
+        push @elements, [$elem, $currText, $currAttrib];
+        #logger('DEBUG', "  elem: $elem = $currText");
+    }
+    # end element name
+    pop @currElemPath;
+    $currElem = "";
+    $currText = "";
+    $currAttrib ="";
+}
+
+sub characters {
+    my ($self, $char) = @_;
+    # process character data event
+    #logger('DEBUG', "characters: $self, $char");
+    # add to current content
+    $currText .= $$char{'Data'};
+    #logger('DEBUG', "  Text: $currText");
+}
+
+
+1;
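Usage sketch (not part of this changeset): how a script can drive HarvestmetaHandler through XML::SAX::ParserFactory, in the same way harvestmeta.pl below does; the input file name is a placeholder.

use strict;
use XML::SAX::ParserFactory;
use lib '/usr/local/mpiwg/archive';
use HarvestmetaHandler;

# build a SAX parser with this handler and parse one index file
my $handler = HarvestmetaHandler->new;
my $parser  = XML::SAX::ParserFactory->parser(Handler => $handler);
$parser->parse_uri("example-resource/index.meta");

# getData returns triples of [element path, text content, attribute string]
foreach my $e ($handler->getData()) {
    my ($path, $text, $attrs) = @$e;
    print "$path = $text ($attrs)\n";
}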
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MPIWGStor.pm Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,269 @@ +#!/usr/local/bin/perl -w + +# +# Library with definitions and common routines for MPIWG storage system +# scripts +# + +package MPIWGStor; + +use strict; +use vars qw(@ISA @EXPORT $VERSION + $debug %junk_files %index_files $index_file_perm $archived_index_file_perm + $file_perm); +use Exporter; + +$VERSION = 0.3; # ROC 24.9.2003 + +@ISA = qw(Exporter); + +@EXPORT = qw($debug %junk_files %index_files $index_file_perm $archived_index_file_perm + $file_perm + &parseargs &logger &stime &s2stime &ymd_date &split_file_path &sstrip + &valid_file_name &valid_dir_name &read_xml &write_xml); + +# debug level +$debug = 0; + +# junk files +%junk_files = ( + '.HSResource' => '.HSResource', + '.HSancillary' => '.HSancillary', + '.HSicon' => '.HSicon', + 'Network Trash Folder' => 'Network Trash Folder', + 'TheVolumeSettingsFolder' => 'TheVolumeSettingsFolder', + '.DS_Store' => '.DS_Store' + ); + +# filenames to not delete (but archive) +%index_files = ( + 'index.meta' => 'index.meta', + 'index.meta.old' => 'index.meta.old' + ); + +# default permissions for normal index files (rw-rw-r--) +$index_file_perm = 0664; +# default permissions for archived index files (rw-r--r--) +$archived_index_file_perm = 0644; +# default permissions for other files (rw-rw-r--) +$file_perm = 0664; + + +# +# $param_hash = parseargs; +# +# reads @ARGV and returns a hash with all options like "-option=value" +# and the last part of the string as "path" +# +sub parseargs { + my %opts; + foreach my $s (@ARGV) { + if ($s =~ /^-([^=]+)=*(.*)$/) { + $opts{$1} = ($2) ? $2 : $1; + } else { + $opts{'path'} = $s; + } + } + return \%opts; +} + + +# +# logger($level, $message) +# +# logs the $message (mostly to stdout) +# +sub logger { + my ($level, $message) = @_; + if ($debug || ($level ne "DEBUG")) { + print "$level: $message\n"; + } +} + +# +# $stime = stime($utime) +# +# format utime (seconds since epoch) into string +# representation: "YYYY/MM/DD HH:MM:SS" +# +sub stime { + my ($utime) = @_; + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = + localtime($utime); + my $yy = $year + 1900; + my $mm = $mon + 1; + my $stime = sprintf("%d/%02d/%02d %02d:%02d:%02d", + $yy, $mm, $mday, $hour, $min, $sec); + return $stime; +} + +# +# $stime = s2stime($stime2) +# +# format db-like time (2003-09-19 12:43:32+02) into string +# representation: "YYYY/MM/DD HH:MM:SS" +# +sub s2stime { + my ($s2time) = @_; + my $stime = ""; + if ($s2time =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)/) { + my ($year,$mon,$mday,$hour,$min,$sec) = ($1, $2, $3, $4, $5, $6); + $stime = sprintf("%d/%02d/%02d %02d:%02d:%02d", + $year, $mon, $mday, $hour, $min, $sec); + } + return $stime; +} + +# +# $date = ymd_date($olddate) +# +# converts a date string in mm/dd/yy format to yyyy/mm/dd format +# +sub ymd_date { + my ($mdy) = @_; + my $ydm; + if ($mdy =~ /(\d+)\/(\d+)\/(\d+)/) { + my $m = $1; + my $d = $2; + my $y = $3; + # extend yy to yyyy + $y = ($y < 70) ? (2000 + $y) : (($y < 100) ? 
($y + 1900) : $y); + $ydm = sprintf "%d/%02d/%02d", $y, $m, $d; + } + return $ydm; +} + +# +# ($name, $path) = split_file_path($fn) +# +# splits the file path $fn into +# file name (the last path element) and path +# +sub split_file_path { + my ($fn) = @_; + + if ($fn =~ /^(.*)\/([^\/]+)$/) { + return ($2, $1); + } + return $fn; +} + + +# +# $name = sstrip($name) +# +# strips leading and trailing whitespace from $name +# +sub sstrip { + my ($name) = @_; + + if ($name =~ /\s*(.*)\s*/) { + return $1; + } + return $name; +} + + +# +# $valid = valid_file_name($name) +# +# checks if $name is a valid file name +# +sub valid_file_name { + return valid_name($_[0], 0); +} + + +# +# $valid = valid_dir_name($name) +# +# checks if $name is a valid directory name +# +sub valid_dir_name { + return valid_name($_[0], 1); +} + + +# +# $valid = valid_name($name, $mode) +# +# checks if $name is a valid file ($mode=0) or directory name (mode=1) +# +sub valid_name { + my ($name, $mode) = @_; + + # whitespace? + if ($name =~ /\s+/) { + return 0; + } + # invalid characters + if ($name !~ /^[-\w.\/]+$/) { + return 0; + } + # files have extension + if ($mode == 0) { + if ($name !~ /\.[-\w]+$/) { + return 0; + } + } + return 1; +} + + +# +# ($document, $rootnode) = read_xml($file) +# +# reads xml meta file $file +# returns root node element +# +sub read_xml { + my ($file) = @_; + my $document; + + my $parser = XML::LibXML->new(); + if (! $parser) { + logger("ABORT", "unable to create parser for metadata index file!!"); + exit 1; + } + + logger("INFO", "index file: $file"); + eval { $document = $parser->parse_file($file) }; + # catch parsing errors + if ($@) { + logger("ABORT", "XML syntax error in index file!!"); + exit 1; + } + my $rootnode = $document->documentElement; + logger("INFO", "root element: \'" . $rootnode->nodeName . "\'"); + + return ($document, $rootnode); +} + + +# +# write_xml($document, $file) +# +# saves the XML tree of $document to the file $file +# +sub write_xml { + my ($document, $filename) = @_; + # keep backup copy + if (-f $filename) { + if (! rename $filename, "$filename.old") { + logger("ABORT", "unable to change file $filename!"); + exit 1; + } + } + # write new file + if ($document->toFile($filename)) { + logger("INFO", "written new file $filename"); + chmod $index_file_perm, $filename; + } else { + logger("ABORT", "unable to write file $filename!"); + exit 1; + } +} + + +# module init +return 1;
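Usage sketch (not part of this changeset) of the exported MPIWGStor helpers, assuming the calling script was started as e.g. "somescript -debug=1 /some/dir":

use strict;
use XML::LibXML;
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

my $args = parseargs;                      # {'debug' => '1', 'path' => '/some/dir'}
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

logger("INFO", "started at " . stime(time));
my ($name, $path) = split_file_path("$$args{'path'}/index.meta");
logger("DEBUG", "file '$name' in directory '$path'");

# read_xml exits on parse errors, so only a valid document comes back
my ($document, $rootnode) = read_xml("$path/$name");
logger("INFO", "root element: " . $rootnode->nodeName);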
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/archivecheck.pl Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,391 @@ +#!/usr/local/bin/perl -w + +use strict; + +use XML::LibXML; + +# MPIWG libraries +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; + +# make output unbuffered +$|=1; + +####################################################### +# internal parameters +# + +# program version +my $version = "0.3 (24.9.2003)"; + +# read command line parameters +my $args = parseargs; + +# debug level +$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; + +# XML namespace (not really implemented!) +my $namespace = ""; + +# archive name (archive-path element, usually == $docdir) +my $archname; +# archive storage date +my $archdate; + + +####################################################### +# external programs +# +my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc"; +if (! -x $archprog) { + logger("ABORT", "TSM client program '$archprog' missing!!"); + exit 1; +} +# my $checkprog = "/usr/local/mpiwg/archive/metacheck"; +# if (! -x $checkprog) { +# logge("ABORT", "meta data checking program '$checkprog' missing!!"); +# exit 1; +# } +# log file for archiver +my $log_file = "/var/tmp/archivecheck.log"; +if (! open LOG, ">>$log_file") { + logger("ABORT", "unable to write log file '$log_file'!!"); + exit 1; +} + +####################################################### +# check parameters that were passed to the program +# +my $docdir = $$args{'path'}; +if (! $docdir) { + print "ABORT: no document directory given!\n"; + exit 1; +} +# strip trailing slashes +$docdir =~ s/\/$//; +if (! -d $docdir) { + print "ABORT: document directory \'$docdir\' doesn't exist!\n"; + exit 1; +} + +my $metafile = "$docdir/index.meta"; +if (! -f $metafile) { + print "ABORT: metadata index file \'$metafile\' doesn't exist!\n"; + exit 1; +} + +####################################################### +# internal variables +# + +# number of errors +my $errcnt = 0; +# number of warnings +my $warncnt = 0; + +####################################################### +# subroutines +# + + +# +# $files = read_resource_meta($rootnode) +# +# checks general resource meta information and reads the list of files +# +sub read_resource_meta { + my ($rootnode) = @_; + my %files; + # + # archive path + # + # get archive-path + $archname = sstrip($rootnode->findvalue('child::archive-path')); + if (! $archname) { + logger("ABORT", "archive-name element missing!!"); + exit 1; + } + + # + # files + # + my @filenodes = $rootnode->findnodes('child::file'); + foreach my $fn (@filenodes) { + my $name = sstrip($fn->findvalue('child::name')); + my $path = sstrip($fn->findvalue('child::path')); + logger("DEBUG", "FILE: ($path)$name"); + my $f = ($path) ? "$path/$name" : "$name"; + $files{$f} = [$name]; + } + + # + # dirs + # + my @dirnodes = $rootnode->findnodes('child::dir'); + foreach my $fn (@dirnodes) { + my $name = sstrip($fn->findvalue('child::name')); + my $path = sstrip($fn->findvalue('child::path')); + logger("DEBUG", "DIR: ($path)$name"); + my $f = ($path) ? 
"$path/$name" : "$name"; + $files{$f} = [$name]; + } + + # + # archive-storage-date + # + my $archdate = $rootnode->find('child::archive-storage-date'); + if ($archdate) { + logger("INFO", "archive storage date: $archdate"); + } else { + logger("ERROR", "archive storage date missing!"); + $errcnt++; + } + return \%files; +} + + +# +# fs_read_files($realdir, $docdir, \%files, \%dirs) +# +# reads all files and directories below $realdir and puts the +# files in %files and directories in %dirs +# $docdir is only for recursion, it should be empty when called +# from outside +# +sub fs_read_files { + my ($directory, $docdir, $files, $dirs) = @_; + my $cnt = 0; + + if (! opendir DIR, $directory) { + return 0; + } + my @dirfiles = readdir DIR; + foreach my $fn (@dirfiles) { + # ignore names starting with a dot + next if ($fn =~ /^\./); + # ignore other silly files + next if ($junk_files{$fn}); + + $cnt++; + my $f = "$directory/$fn"; + my $docf = ($docdir) ? "$docdir/$fn" : $fn; + #print "fs_file: \"$f\"\n"; + if (-f $f) { + #print " is file\n"; + my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, + $atime,$mtime,$ctime,$blksize,$blocks) + = stat(_); + $$files{$docf} = [$fn, $size, stime($mtime)]; + #logger("TEST", "fn $fn, size $size, mtime $mtime"); + } elsif (-d _) { + #print " is dir\n"; + $$dirs{$docf} = $fn; + # recurse into directory + $cnt += fs_read_files($f, $docf, $files, $dirs); + } + } + return $cnt; +} + + +# +# $%files = run_query +# +# runs the archiver program on $docdir and returns a list of archived files +# +# Sample output: +# 20,345 B 08/06/03 17:17:02 /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839/index.meta Never /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839 +# +sub run_query { + my %files; + print LOG "START checkarchive $version ", scalar localtime, "\n"; + my $archcmd = $archprog; + $archcmd .= " query archive -subdir=yes"; + $archcmd .= " -description='$archname'"; + $archcmd .= " '$docdir/'"; + + my $archcnt = 0; + print LOG "CMD: $archcmd\n"; + if (open ARCH, "$archcmd 2>&1 |") { + while (<ARCH>) { + chomp; + print LOG "ARCH: $_\n"; + if (/ + \s*([\d,]+) # size + \s+(\w+) # unit of size + \s+([\d\/]+) # date mm\/dd\/yy + \s+([\d:]+) # time + \s+(\S+) # file name + \s+(\w+) # expiry + \s+(\S+) # archive label + /x) { + my $size = $1; + my $sunit = $2; + my $date = $3; + my $time = $4; + my $file = $5; + my $exp = $6; + my $label = $7; + $size =~ s/,//g; + $date = ymd_date($date); + logger("DEBUG", " QUERY: file '$file'"); + $archcnt++; + if ($files{$file}) { + logger("WARNING", "file $file seems to be archived multiple times: $time $date"); + $warncnt++; + } + $files{$file} = [$size, "$date $time"]; + } + } + logger("INFO", "$archcnt archives of " . (scalar keys %files) . " files."); + } else { + logger("ABORT", "unable to start archive command '$archcmd'!!"); + exit 1; + } + + return \%files; +} + + +# +# check_files(\%files_to_archive, \%archived_files) +# +# compares the list of archived and to be archived files +# +sub check_files { + my ($to_archive, $archived) = @_; + + my $nt = scalar keys %$to_archive; + my $na = scalar keys %$archived; + + foreach my $ft (sort keys %$to_archive) { + my $fp = "$docdir/$ft"; + #logger("DEBUG", " fp: $fp"); + if ($$archived{$fp}) { + logger("DEBUG", "$ft archived OK"); + $$archived{$fp}->[2] = "OK"; + } else { + logger("ERROR", "file entry '$ft' missing from archive!"); + $errcnt++; + } + } + + foreach my $fa (sort keys %$archived) { + if (! 
$$archived{$fa}->[2]) { + my ($fn, $fp) = split_file_path($fa); + if ($index_files{$fn}) { + logger("DEBUG", "$fa ignored"); + $na--; + } else { + logger("WARNING", "$fa archived but not in list!"); + $warncnt++; + } + } + } + + if ($nt > $na) { + logger("WARNING", "less files were archived ($na vs. $nt)!"); + $warncnt++; + } elsif ($na > $nt) { + logger("WARNING", "more files were archived ($na vs. $nt)!"); + $warncnt++; + } + +} + +# +# compare_files(\%files_on_disk, \%archived_files) +# +# compares the list of archived files and files on disk +# +sub compare_files { + my ($fs_files, $archived) = @_; + + foreach my $ft (sort keys %$fs_files) { + next if ($index_files{$ft}); + my $fp = "$docdir/$ft"; + #logger("DEBUG", " fp: $fp"); + if ($$archived{$fp}) { + next if ($index_files{$ft}); + + my $asize = $$archived{$fp}[0]; + my $atime = $$archived{$fp}[1]; + my $fsize = $$fs_files{$ft}[1]; + my $ftime = $$fs_files{$ft}[2]; + if ($asize != $fsize) { + logger("ERROR", "archived $ft ($asize) and file on disk ($fsize) have different size!"); + $errcnt++; + } elsif ($atime lt $ftime) { + logger("ERROR", "archived $ft ($atime) is older than file on disk ($ftime)!"); + $errcnt++; + } else { + logger("ERROR", "archived file $ft still on disk"); + $errcnt++; + } + } else { + logger("ERROR", "file '$ft' on disk missing from archive!"); + $errcnt++; + } + } +} + + + +####################################################### +# main +# + +logger("INFO", "archivecheck $version"); + +# make shure the right user is running this program +my $user = getlogin; +if (($user ne "archive")&&($user ne "root")) { + logger("ABORT", "you must be archive or root user to run this program!"); + exit 1; +} + +# read index.meta file +my ($document, $rootnode) = read_xml($metafile); + +# check file and add archive date +my $files_to_archive = read_resource_meta($rootnode); + +# check for .archived file +if (-f "$docdir/.archived") { + logger("INFO", ".archived file exists."); +} else { + logger("WARNING", "no .archived file!"); + $warncnt++; +} + +# check archive +my $archived_files = run_query; + +my $num_arch_files = (scalar keys %$archived_files); +if ($num_arch_files == 0) { + logger("ABORT", "no archive of this directory!!"); + exit 1; +} +logger("INFO", "$num_arch_files files archived"); + +# check list of archived files +check_files($files_to_archive, $archived_files); + +# read files from filesystem +my %fsfiles; +my %fsdirs; +my $num_fs_files = fs_read_files($docdir, "", \%fsfiles, \%fsdirs); + +logger("INFO", "$num_fs_files files still on disk!"); +if ($num_fs_files > 0) { + compare_files(\%fsfiles, $archived_files); +} + +logger("INFO", "$warncnt warnings"); +logger("INFO", "$errcnt errors"); +if ($errcnt == 0) { + logger("DONE", "" . (scalar keys %$archived_files) . " archived files checked"); +} else { + logger("ABORT", "there were $errcnt errors!!"); + exit 1; +}
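A one-off sketch (not part of the script) showing run_query's field-splitting regex applied to the sample dsmc output line quoted in the comment above:

use strict;
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

my $line = "20,345  B  08/06/03  17:17:02  /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839/index.meta Never /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839";
if ($line =~ /\s*([\d,]+)\s+(\w+)\s+([\d\/]+)\s+([\d:]+)\s+(\S+)\s+(\w+)\s+(\S+)/) {
    my ($size, $unit, $date, $time, $file, $expiry, $label) = ($1, $2, $3, $4, $5, $6, $7);
    $size =~ s/,//g;              # "20,345" -> "20345"
    $date = ymd_date($date);      # "08/06/03" -> "2003/08/06"
    print "$file: $size $unit, archived $date $time\n";
}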
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/archiver.pl Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,407 @@ +#!/usr/local/bin/perl -w + +use strict; + +use XML::LibXML; + +# MPIWG libraries +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; + +# make output unbuffered +$|=1; + +####################################################### +# internal parameters +# + +# program version +my $version = "0.5 (15.1.2004)"; + +# rewrite XML file (necessary for archive date!) +my $fix_xml = 1; +my $xml_changed = 0; +# XML namespace (not really implemented!) +my $namespace = ""; + +# archive name (archive-path element, usually == $docdir) +my $archname; +# archive storage date (now) +my $archdate = stime(time); + +# delete "junk" files before archiving +my $delete_junk_files = 1; + +# delete data files after archiving +my $delete_data_files = 1; + + +####################################################### +# external programs +# +my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc"; +if (! -x $archprog) { + print "ABORT: TSM client program '$archprog' missing!!\n"; + exit 1; +} +my $checkprog = "/usr/local/mpiwg/archive/metacheck"; +if (! -x $checkprog) { + print "ABORT: meta data checking program '$checkprog' missing!!\n"; + exit 1; +} +# log file for archiver +my $log_file = "/var/log/mpiwg-archiver.log"; +if (! open LOG, ">>$log_file") { + print "ABORT: unable to write log file '$log_file'!!\n"; + exit 1; +} + +####################################################### +# check parameters that were passed to the program +# +if ($#ARGV < 0) { + print "ABORT: no document directory given!\n"; + exit 1; +} +my $docdir = $ARGV[0]; +# strip trailing slashes +$docdir =~ s/\/$//; +if (! -d $docdir) { + print "ABORT: document directory \'$docdir\' doesn't exist!\n"; + exit 1; +} +if (($#ARGV > 0)&&($ARGV[1] eq "-premigrate")) { + $delete_data_files = 0; +} + +my $metafile = "$docdir/index.meta"; +if (! -f $metafile) { + print "ABORT: metadata index file \'$metafile\' doesn't exist!\n"; + exit 1; +} + +####################################################### +# internal variables +# + +# number of errors +my $errcnt = 0; +# number of warnings +my $warncnt = 0; + +####################################################### +# subroutines +# + +# +# $files = read_resource_meta($rootnode) +# +# checks general resource meta information and reads the list of files +# +sub read_resource_meta { + my ($rootnode) = @_; + my %files; + # + # archive path + # + # get archive-path + $archname = MPIWGStor::sstrip($rootnode->findvalue('child::archive-path')); + if (! $archname) { + print "ABORT: archive-name element missing!!\n"; + exit 1; + } + + # + # files + # + my @filenodes = $rootnode->findnodes('child::file'); + foreach my $fn (@filenodes) { + my $name = MPIWGStor::sstrip($fn->findvalue('child::name')); + my $path = MPIWGStor::sstrip($fn->findvalue('child::path')); + print "FILE: ($path)$name\n"; + my $f = ($path) ? "$path/$name" : "$name"; + $files{$f} = $name; + } + + # + # archive-storage-date + # + my $stordatenode = ($rootnode->find('child::archive-storage-date'))->get_node(1); + if ($stordatenode) { + print "WARNING: archive storage date exists! 
Resource already archived?\n"; + $warncnt++; + # delete old date + $stordatenode->removeChildNodes; + } else { + # create new storage date node + $stordatenode = $rootnode->addNewChild($namespace, "archive-storage-date"); + # move after archive-path + $rootnode->insertAfter($stordatenode, ($rootnode->find('child::archive-path'))->get_node(1)); + } + $stordatenode->appendTextNode($archdate); + $xml_changed++; + return \%files; +} + + +# +# $%files = run_archive +# +# runs the archiver program on $docdir and returns a list of archived files +# +sub run_archive { + my %files; + print LOG "START archiver $version $archdate\n"; + my $archcmd = $archprog; + $archcmd .= " archive -archsymlinkasfile=no -subdir=yes"; + $archcmd .= " -description='$archname'"; + $archcmd .= " '$docdir/'"; + + print LOG "CMD: $archcmd\n"; + if (open ARCH, "$archcmd 2>&1 |") { + while (<ARCH>) { + chomp; + print LOG "ARCH: $_\n"; + if (/Normal File-->\s+[\d,]+\s+(.*)\s+\[Sent\]/) { + print " ARCH: file '$1'\n"; + $files{$1} = "ok"; + } + if (/^Archive processing of .* finished without failure./) { + print " ARCH: OK\n"; + } + } + } else { + print "ABORT: unable to start archive command '$archcmd'!!\n"; + exit 1; + } + + return \%files; +} + + +# +# check_files(\%files_to_archive, \%archived_files) +# +# compares the list of archived and to be archived files +# +sub check_files { + my ($to_archive, $archived) = @_; + + my $nt = scalar keys %$to_archive; + my $na = scalar keys %$archived; + + foreach my $ft (sort keys %$to_archive) { + my $fp = "$docdir/$ft"; + #print " fp: $fp\n"; + if ($$archived{$fp}) { + print "DEBUG: $ft archived OK\n"; + $$archived{$fp} = "OK"; + } else { + print "ERROR: file '$ft' missing from archive!\n"; + $errcnt++; + } + } + + foreach my $fa (sort keys %$archived) { + if ($$archived{$fa} ne "OK") { + my ($fn, $fp) = MPIWGStor::split_file_path($fa); + if ($MPIWGStor::index_files{$fn}) { + print "DEBUG: $fa ignored\n"; + $na--; + } else { + print "WARNING: $fa archived but not in list!\n"; + $warncnt++; + } + } + } + + if ($nt > $na) { + print "WARNING: less files were archived ($na vs. $nt)!\n"; + } elsif ($na > $nt) { + print "WARNING: more files were archived ($na vs. $nt)!\n"; + } + +} + + +# +# delete_files(\%files) +# +# deletes the files from the list (of absolute files) and their directories +# if they are empty +# +sub delete_files { + my ($files) = @_; + my %dirs; + + foreach my $f (sort keys %$files) { + my ($fn, $fp) = MPIWGStor::split_file_path($f); + # collect all unique directories + if ($fp && (! $dirs{$fp})) { + $dirs{$fp} = $fp; + } + # don't delete index files + next if ($MPIWGStor::index_files{$fn}); + # no file no delete + next unless (-f $f); + # delete files + if (unlink $f) { + print "INFO: remove $f ($fn)\n"; + } else { + print "ERROR: unable to delete $f!\n"; + $errcnt++; + } + } + # try to delete all empty directories + my @dirkeys = sort keys %dirs; + # starting at the end to get to the subdirectories first + for (my $i = $#dirkeys; $i >= 0; $i--) { + my $d = $dirkeys[$i]; + # dont't remove document dir (shouldn't be empty anyway) + next if ($d eq $docdir); + if (-d $d) { + print "INFO: remove dir $d\n"; + rmdir $d; + } + } +} + + +# +# delete_all_files(\%files, $dir) +# +# deletes all files with names from the list %files +# in the directory $dir and its subdirectories +# +sub delete_all_files { + my ($files, $dir) = @_; + + if (! 
opendir DIR, $dir) { + print "ERROR: unable to read directory $dir!\n"; + $errcnt++; + return; + } + my @fl = readdir DIR; + closedir DIR; + + foreach my $f (@fl) { + next if ($f =~ /^\.{1,2}$/); + if ($$files{$f}) { + # $f is in the file list + if (-f "$dir/$f") { + # $f is a file + if (unlink "$dir/$f") { + print "INFO: removed $f\n"; + } else { + print "ERROR: unable to delete $f!\n"; + $errcnt++; + } + } elsif (-d _) { + # $f is a directory (unlink won't work) + if ((system 'rm', '-r', "$dir/$f") == 0) { + print "INFO: removed directory $f\n"; + } else { + print "ERROR: unable to delete directory $f!\n"; + $errcnt++; + } + } else { + print "ERROR: funny object $dir/$f!\n"; + $errcnt++; + } + } else { + # $f is not in the list + if (-d "$dir/$f") { + # recurse into directories + print "DEBUG: enter $dir/$f\n"; + delete_all_files($files, "$dir/$f"); + } + } + } +} + + +####################################################### +# main +# + +print "START: archiver $version at $archdate\n"; + +# make shure the right user is running this program +my $user = getlogin; +#if (($user ne "archive")||($user ne "root")) { +# logger("ABORT", "you ($user) must be archive or root user to run this program!"); +# exit 1; +#} + +# use metacheck first +if (system("$checkprog $docdir >/dev/null") == 0) { + print "INFO: resource '$docdir' check OK\n"; +} else { + print "ABORT: resource '$docdir' check failed!!\n"; + exit 1; +} + +# read index.meta file +my ($document, $rootnode) = MPIWGStor::read_xml($metafile); + +# check file and add archive date +my $files_to_archive = read_resource_meta($rootnode); + +print "INFO: ", scalar keys %$files_to_archive, " files to archive\n"; + +# check for .archived file +if (-f "$docdir/.archived") { + if (unlink "$docdir/.archived") { + print "WARNING: existing .archived file has been removed! Resource already archived?\n"; + $warncnt++; + } else { + print "ERROR: unable to remove existing .archived file!\n"; + $errcnt++; + } +} + +# remove junk files +if ($delete_junk_files) { + delete_all_files(\%MPIWGStor::junk_files, $docdir); +} + +# write new index.meta +if ($errcnt > 0) { + print "ABORT: there were errors!\n"; + exit 1; +} else { + if ($fix_xml) { + MPIWGStor::write_xml($document, $metafile); + } +} + +# start archiving +my $archived_files = run_archive; + +print "INFO: ", scalar keys %$archived_files, " files archived\n"; + +# check list of archived files +check_files($files_to_archive, $archived_files); + +# delete files if all went OK +if ($errcnt == 0) { + system("touch", "$docdir/.archived"); + # remove junk files (again) + if ($delete_junk_files) { + delete_all_files(\%MPIWGStor::junk_files, $docdir); + } + # remove archived files + if ($delete_data_files) { + delete_files($archived_files); + } +} + +print "INFO: $warncnt warnings\n"; +print "INFO: $errcnt errors\n"; +my $num_archfiles = %$archived_files + 1; +if ($errcnt > 0) { + print "ABORT: there were errors! ($num_archfiles files archived) at ", stime(time), "\n"; + exit 1; +} else { + print "DONE: $num_archfiles files archived at ", stime(time), "\n"; +}
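Invocation sketch (installed path and resource directory are assumptions, not part of this changeset): archive one resource but keep the data files on disk, which is what the optional -premigrate argument controls via $delete_data_files.

use strict;

my $resource = "/mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839";
# without "-premigrate" the data files are deleted after a successful archive run
system("/usr/local/mpiwg/archive/archiver", $resource, "-premigrate") == 0
    or die "archiver reported errors for $resource\n";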
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/harvestmeta.pl Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,289 @@ +#!/usr/local/bin/perl -w + +use strict; +use XML::SAX; +use DBI; + +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; +use HarvestmetaHandler; + +# make output unbuffered +$|=1; + +####################################################### +# internal parameters +# + +# program version +my $version = "0.1 (08.06.2004)"; + +# read command line parameters +my $args = MPIWGStor::parseargs; + +# debug level +$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; + +# XML namespace (not really implemented!) +my $namespace = ""; + +# delete and rebuild database +my $purgeDB = (exists $$args{'purgedb'}); + +# database connection +my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", ""); +if (! $dbh) { + logger('ABORT', "unable to connect to database!"); + exit 1; +} +$dbh->{AutoCommit} = 0; +my $dbNextFileId; +my $dbNewFile; +my $dbNewMeta; +my $dbClearMeta; +my $dbFindFileName; +my $dbFindFilePath; +my $dbClearFile; +my $dbFindFileFlag; +my $dbFindFileFlagPath; +my $dbSetFileFlag; +my $dbClearAllFileFlag; + +####################################################### +# check parameters that were passed to the program +# +my $basedir = $$args{'path'}; +if (! $basedir) { + logger("ABORT", "no document directory given!"); + exit 1; +} +# strip trailing slashes +$basedir =~ s/\/$//; +if (! -d $basedir) { + logger("ABORT", "document directory \'$basedir\' doesn't exist!"); + exit 1; +} + +my $metaParserHandler = HarvestmetaHandler->new; +my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler); + +####################################################### +# internal variables +# + +# number of errors +my $errcnt = 0; +# number of warnings +my $warncnt = 0; + +# number of files on fs +my $fcnt = 0; +# number of index files +my $idxcnt = 0; + +####################################################### +# subroutines +# + +# +# readAllFiles($realdir, $basedir, \%files, \%dirs) +# +# reads all files and directories below $realdir and puts the +# files in %files and directories in %dirs +# $basedir is only for recursion, it should be empty when called +# from outside +# +sub readAllFiles { + my ($directory, $basedir) = @_; + my $cnt = 0; + + if (! opendir DIR, $directory) { + return 0; + } + my @dirfiles = readdir DIR; + foreach my $fn (@dirfiles) { + # ignore names starting with a dot + next if ($fn =~ /^\./); + # ignore other silly files + next if ($junk_files{$fn}); + + $cnt++; + $fcnt++; + my $f = "$directory/$fn"; + my $docf = ($basedir) ? "$basedir/$fn" : $fn; + #logger('DEBUG', "fs_file: \"$f\""); + if (-f $f) { + #logger(" is file"); + if ($fn eq "index.meta") { + harvestFile($fn, $directory); + } + } elsif (-d _) { + #logger(" is dir"); + # recurse into directory + $cnt += readAllFiles($f, $docf); + } + } + return $cnt; +} + +# +# cleanUnmarkedFiles($basepath) +# +# deletes all unflagged file and meta entries. +# +sub cleanUnmarkedFiles { + my ($basepath) = @_; + my $rv = $dbFindFileFlagPath->execute("${basepath}%"); + my $ids = $dbFindFileFlagPath->fetchall_arrayref; + for my $i (@$ids) { + my $id = $$i[0]; + logger('DEBUG', "cleaning file and meta of id: $id"); + $dbClearMeta->execute($id); + $dbClearFile->execute($id); + $dbh->commit; + } +} + +# +# harvestFile($filename, $filepath) +# +# reads the index file $filename at $filepath and puts the contents +# in the database. 
+# +sub harvestFile { + my ($filename, $filepath) = @_; + logger('DEBUG', "looking at file '$filename' at '$filepath'"); + # get file time + my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, + $atime,$mtime,$ctime,$blksize,$blocks) + = stat("$filepath/$filename"); + my $filetime = stime($mtime); + # register file in db + my $fid = registerFile("$filepath/$filename", $filetime); + if ($fid) { + # file is new/modified + # parse index file + $metaParser->parse_uri("$filepath/$filename"); + my @data = $metaParserHandler->getData(); + logger('DEBUG', "parsed $#data+1 elements"); + registerMeta($fid, @data); + } + $idxcnt++; + logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ; +} + +# +# $fileid = registerFile($filepath, $filetime) +# +# returns the file ID for the file $filepath. If necessary it +# will be added to the database. returns 0 if an update is not necessary. +# +sub registerFile { + my ($filepath, $filetime) = @_; + my $fileid = 0; + # look if file is in db + my $rv = $dbFindFileName->execute($filepath); + my $mtime; + ($fileid, $mtime) = $dbFindFileName->fetchrow_array; + if ($fileid) { + # file is in db + # update flag + $dbSetFileFlag->execute($fileid, 1); + $dbh->commit; + my $stime = s2stime($mtime); + if ($stime ge $filetime) { + # if its current return 0 + logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')"); + return 0; + } else { + logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')"); + } + } + if (! $fileid) { + # get a new file id + my $rv = $dbNextFileId->execute; + ($fileid) = $dbNextFileId->fetchrow_array; + logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime"); + $dbNewFile->execute($fileid, $filepath, $filetime); + # update flag + $dbSetFileFlag->execute($fileid, 1); + $dbh->commit; + } + return $fileid; +} + +# +# registerMeta($fileid, @meta) +# +# adds the metadata information @meta for $fileid to the database. +# +sub registerMeta { + my ($fileid, @meta) = @_; + logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)"); + my $idx = 0; + foreach my $keyval (@meta) { + #logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]"); + $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]); + } + $dbh->commit; + logger('INFO', "added $idx elements (file $fileid)"); +} + +# +# initdb() +# +# initialises the database connection. +# +sub initDB { + my $rv; + # clean tables + if ($purgeDB) { + $rv = $dbh->do("delete from files"); + $rv = $dbh->do("delete from meta"); + if ($dbh->err) { + logger('ABORT', "unable to clean table!"); + exit 1; + } + $dbh->commit; + } + + # clear flags + $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )"); + $dbh->commit; + + # prepare statements + $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')"); + $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)"); + $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?"); + $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?"); + $dbClearFile = $dbh->prepare("delete from files where id=?"); + $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?"); + $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? 
and flag is null"); + $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)"); + $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)"); + $dbClearMeta = $dbh->prepare("delete from meta where fileid=?"); + +} + +####################################################### +# main +# + +logger("INFO", "harvestmeta $version"); + +initDB(); + +# read and process all files under $basedir +my $fnum = readAllFiles($basedir, ""); +# delete orphaned data (under $basedir) +cleanUnmarkedFiles($basedir); + +logger("INFO", "analysed $idxcnt of $fnum files!"); +logger("INFO", "$warncnt warnings"); +logger("INFO", "$errcnt errors"); +if ($errcnt > 0) { + logger("ABORT", "there were errors!"); + exit 1; +} else { + logger("DONE", "all index files read successfully!"); +}
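The prepared statements above imply a small PostgreSQL schema for the "storage" database. A sketch of matching table definitions, inferred from the insert/select statements (column types are assumptions, not part of this changeset):

use strict;
use DBI;

my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "", {RaiseError => 1});
# one row per harvested index.meta file
$dbh->do("create sequence files_id_seq");
$dbh->do("create table files (id integer primary key, filename text, mtime timestamp with time zone)");
# one row per metadata element, ordered by idx within a file
$dbh->do("create table meta (fileid integer references files (id), idx integer, tags text, attributes text, content text)");
$dbh->disconnect;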
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacheck.pl Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,414 @@ +#!/usr/local/bin/perl -w + +use strict; +use XML::LibXML; + +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; + +# make output unbuffered +$|=1; + +####################################################### +# internal parameters +# + +# program version +my $version = "0.5.2 (7.1.2004)"; + +# read command line parameters +my $args = MPIWGStor::parseargs; + +# debug level +$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; + +# check only or fix index file also +my $check_only = (exists $$args{'checkonly'}) ? $$args{'checkonly'} : 0; +# add file tags for missing files +my $fix_files = ! $check_only; +# add tags for file size and date +my $fix_fs_meta = 1; +# add dir tags for missing directories +my $fix_dirs = ! $check_only; +# rewrite XML file (necessary for fix_files and fix_dirs) +my $fix_xml = ! $check_only; +my $xml_changed = 0; +# XML namespace (not really implemented!) +my $namespace = ""; + + +####################################################### +# check parameters that were passed to the program +# +my $docdir = $$args{'path'}; +if (! $docdir) { + logger("ABORT", "no document directory given!"); + exit 1; +} +# strip trailing slashes +$docdir =~ s/\/$//; +if (! -d $docdir) { + logger("ABORT", "document directory \'$docdir\' doesn't exist!"); + exit 1; +} + +my $metafile = "$docdir/index.meta"; +if (! -f $metafile) { + logger("ABORT", "metadata index file \'$metafile\' doesn't exist!"); + exit 1; +} + +####################################################### +# internal variables +# + +# all files in the document directory tree +my %files; +# all directories in the document directory tree +my %dirs; +# number of errors +my $errcnt = 0; +# number of warnings +my $warncnt = 0; + +####################################################### +# subroutines +# + +# +# fs_read_files($realdir, $docdir, \%files, \%dirs) +# +# reads all files and directories below $realdir and puts the +# files in %files and directories in %dirs +# $docdir is only for recursion, it should be empty when called +# from outside +# +sub fs_read_files { + my ($directory, $docdir, $files, $dirs) = @_; + my $cnt = 0; + + if (! opendir DIR, $directory) { + return 0; + } + my @dirfiles = readdir DIR; + foreach my $fn (@dirfiles) { + # ignore names starting with a dot + next if ($fn =~ /^\./); + # ignore other silly files + next if ($junk_files{$fn}); + + $cnt++; + my $f = "$directory/$fn"; + my $docf = ($docdir) ? "$docdir/$fn" : $fn; + #logger("fs_file: \"$f\""); + if (-f $f) { + #logger(" is file"); + my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, + $atime,$mtime,$ctime,$blksize,$blocks) + = stat(_); + $$files{$docf} = [$fn, $size, $mtime]; + } elsif (-d _) { + #logger(" is dir"); + $$dirs{$docf} = $fn; + # recurse into directory + $cnt += fs_read_files($f, $docf, $files, $dirs); + } + } + return $cnt; +} + + +# +# check_dirs($rootnode, \%dirs) +# +# reads all dir elements under $rootnode and compares with the directory list +# in %dirs +# +sub check_dirs { + my ($root, $fsdirs) = @_; + + # + # iterate all dir tags + # + my @dirnodes = $root->findnodes('child::dir'); + my %okdirs; + foreach my $dirnode (@dirnodes) { + my $dirname = sstrip($dirnode->find('child::name')); + my $dirpath = sstrip($dirnode->find('child::path')); + my $description = sstrip($dirnode->find('child::description')); + # name must be valid + if (! 
valid_dir_name($dirname)) { + logger("ERROR", "directory name ($dirpath) $dirname invalid!"); + $errcnt++; + } + # description can be present + if (! $description) { + logger("WARNING", "description for directory $dirname (in $dirpath/) missing!"); + $warncnt++; + } + # check with dirs on filesystem + my $fn; + if ($dirpath) { + $fn = "$dirpath/$dirname"; + } else { + $fn = "$dirname"; + } + #logger("dir: \"$dirname\", \"$dirpath\""); + if ($$fsdirs{$fn}) { + #logger(" OK ($$fsdirs{$fn})"); + $okdirs{$fn} = $dirname; + } else { + logger("ERROR", "directory $dirname (in $dirpath/) missing on disk!"); + $errcnt++; + } + } + #logger("dirs: ", (scalar keys %$fsdirs), " vs ", (scalar keys %okdirs), ""); + if ((scalar keys %$fsdirs) != (scalar keys %okdirs)) { + # number of dir tags and dirs don't match + # iterate through all dirs + foreach my $f (sort keys %$fsdirs) { + # was this dir missing? + if (! $okdirs{$f}) { + my ($name, $path) = split_file_path($f); + # name must be valid + if (! valid_dir_name($name)) { + $path = "." unless ($path); + logger("ERROR", "directory name $name (in $path/) invalid!"); + $errcnt++; + next; + } + if ($fix_dirs) { + # add missing dir tag + my $dir_node = $root->addNewChild($namespace, "dir"); + $xml_changed++; + # add name + my $name_node = $dir_node->addNewChild($namespace, "name"); + $name_node->appendTextNode($name); + # add path + if ($path) { + my $path_node = $dir_node->addNewChild($namespace, "path"); + $path_node->appendTextNode($path); + } + logger("INFO", "directory $f to be added to index file!"); + } else { + logger("ERROR", "directory $f missing in index file!"); + $errcnt++; + } + } + } + } +} + + +# +# check_files($rootnode, \%files) +# +# reads all file elements under $rootnode and compares with the file list +# in %files +# +sub check_files { + my ($root, $fsfiles) = @_; + + # + # iterate all file tags + # + my @filenodes = $root->findnodes('child::file'); + my %okfiles; + foreach my $filenode (@filenodes) { + my $filename = sstrip($filenode->find('child::name')); + my $filepath = sstrip($filenode->find('child::path')); + my $filesize = sstrip($filenode->find('child::size')); + my $filedate = sstrip($filenode->find('child::date')); + # name must be valid + if (! valid_file_name($filename)) { + logger("ERROR", "file name ($filepath)$filename invalid!"); + $errcnt++; + } + my $fn = ($filepath) ? 
"$filepath/$filename" : "$filename"; + #logger("file: \"$filename\", \"$filepath\""); + if ($$fsfiles{$fn}) { + #logger(" OK ($$fsfiles{$fn})"); + $okfiles{$fn} = $filename; + # check file size and date + if ($filesize) { + if ($filesize != $$fsfiles{$fn}->[1]) { + logger("WARNING", "size of file $fn changed: $filesize to $$fsfiles{$fn}->[1]"); + $warncnt++; + } + } + # file date + if ($filedate) { + if ($filedate ne stime($$fsfiles{$fn}->[2])) { + logger("WARNING", "date of file $fn changed: $filedate to ", stime($$fsfiles{$fn}->[2]), ""); + $warncnt++; + } + } + # update file size and date + if ($fix_fs_meta) { + # delete size and date + foreach my $n ($filenode->findnodes('child::size')) { + $filenode->removeChild($n); + } + foreach my $n ($filenode->findnodes('child::date')) { + $filenode->removeChild($n); + } + # add new size and date + my $node = $filenode->addNewChild($namespace, "size"); + $node->appendTextNode($$fsfiles{$fn}->[1]); + $node = $filenode->addNewChild($namespace, "date"); + $node->appendTextNode(stime($$fsfiles{$fn}->[2])); + $xml_changed++; + } + } else { + logger("ERROR", "file $filename (in $filepath/) missing on disk!"); + $errcnt++; + } + } + #logger("files: ", (scalar keys %$fsfiles), " vs ", (scalar keys %okfiles), ""); + if ((scalar keys %$fsfiles) != (scalar keys %okfiles)) { + # number of file tags and files don't match + # iterate through all files + foreach my $f (sort keys %$fsfiles) { + my ($name, $path) = split_file_path($f); + # was this file missing? + if (! $okfiles{$f}) { + # is an ignoreable file? + if ($index_files{$name}) { + next; + } + # name must be valid + if (! valid_dir_name($name)) { + $path = "." unless ($path); + logger("ERROR", "file name $name (in $path/) invalid!"); + $errcnt++; + next; + } + if ($fix_files) { + # add missing file tag + my $file_node = $root->addNewChild($namespace, "file"); + $xml_changed++; + # add name + my $name_node = $file_node->addNewChild($namespace, "name"); + $name_node->appendTextNode($name); + # add path + if ($path) { + my $path_node = $file_node->addNewChild($namespace, "path"); + $path_node->appendTextNode($path); + } + # add size + my $size_node = $file_node->addNewChild($namespace, "size"); + $size_node->appendTextNode($$fsfiles{$f}->[1]); + # add date + my $date_node = $file_node->addNewChild($namespace, "date"); + $date_node->appendTextNode(stime($$fsfiles{$f}->[2])); + logger("INFO", "file $f to be added to index file!"); + } else { + logger("ERROR", "file $f missing in index file!"); + $errcnt++; + } + } + } + } +} + +# +# check_resource_meta($rootnode) +# +# checks general resource meta information +# +sub check_resource_meta { + my ($rootnode) = @_; + + # + # description + # + my $description = $rootnode->findvalue('child::description'); + if (! $description) { + logger("ERROR", "resource description element missing!"); + $errcnt++; + } + # + # name + # + my $name = sstrip($rootnode->findvalue('child::name')); + if ($name) { + my ($dirname, $dirpath) = split_file_path($docdir); + if ($dirname ne $name) { + logger("ERROR", "resource name element '$name' does not match directory name '$dirname'!"); + $errcnt++; + } + } else { + logger("ERROR", "resource name element missing!"); + $errcnt++; + } + # + # archive path + # + my $realpath; + # get real path + if ($docdir =~ /^\//) { + # docdir is absolute + $realpath = $docdir; + } else { + # docdir is relative -- try with the shell + if (open PWDCMD, "cd $docdir ; pwd|") { + $realpath = <PWDCMD>; + chomp $realpath; + } + } + if (! 
$realpath) { + logger("ERROR", "unable to check real archive path!"); + $errcnt++; + return; + } + # get archive-path + my $archnode = ($rootnode->find('child::archive-path'))->get_node(1); + if ($archnode) { + my $arch = sstrip($archnode->textContent); + if ($arch ne $realpath) { + logger("WARNING", "incorrect archive-path '$arch' will be changed to '$realpath'!"); + $warncnt++; + # correct archive-path + $archnode->removeChildNodes; + $archnode->appendTextNode($realpath); + $xml_changed++; + } + } else { + # add archive-path + $archnode = $rootnode->addNewChild($namespace, "archive-path"); + $archnode->appendTextNode($realpath); + $xml_changed++; + } + +} + + + +####################################################### +# main +# + +logger("INFO", "metacheck $version"); + +my ($document, $rootnode) = read_xml($metafile); + +check_resource_meta($rootnode); + +my $fnum = fs_read_files($docdir, "", \%files, \%dirs); +logger("INFO", "$fnum files on FS"); +#foreach (keys %files) {logger(" file ($_): $files{$_}");} + +check_files($rootnode, \%files); +check_dirs($rootnode, \%dirs); + +logger("INFO", "$warncnt warnings"); +logger("INFO", "$errcnt errors"); +if ($errcnt > 0) { + logger("ABORT", "there were errors!"); + exit 1; +} else { + if ($fix_xml) { + write_xml($document, $metafile); + } + logger("DONE", "index file checked successfully!"); +}
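A sketch of the index.meta shape that metacheck works with; the root element name and the concrete values are invented, but the element names match the XPath expressions used above (description, name, archive-path, file/name/path/size/date, dir/name/path):

use strict;
use XML::LibXML;

my $xml = <<'EOF';
<resource>
  <description>Example resource</description>
  <name>example_resource_01</name>
  <archive-path>/mpiwg/archive/data/example/example_resource_01</archive-path>
  <file>
    <name>page0001.jpg</name>
    <path>pageimg</path>
    <size>20345</size>
    <date>2003/08/06 17:17:02</date>
  </file>
  <dir>
    <name>pageimg</name>
    <description>page images</description>
  </dir>
</resource>
EOF

my $root = XML::LibXML->new->parse_string($xml)->documentElement;
print "archive-path: ", $root->findvalue('child::archive-path'), "\n";
my @files = $root->findnodes('child::file');
my @dirs  = $root->findnodes('child::dir');
printf "%d file tag(s), %d dir tag(s)\n", scalar @files, scalar @dirs;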
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unarchiver.pl Thu Jun 17 17:58:42 2004 +0200 @@ -0,0 +1,327 @@ +#!/usr/local/bin/perl -w + +use strict; + +use XML::LibXML; + +# MPIWG libraries +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; + +# make output unbuffered +$|=1; + + +####################################################### +# internal parameters +# + +# program version +my $version = "0.1 (24.9.2003)"; + +# read command line parameters +my $args = parseargs; + +# debug level +$debug = (exists $$args{'debug'}) ? ($$args{'debug'}) : 0; + +# rewrite XML file (necessary for archive date!) +my $fix_xml = 1; +my $xml_changed = 0; +# XML namespace (not really implemented!) +my $namespace = ""; + +# archive name (archive-path element, usually == $docdir) +my $archname; +# archive storage date +my $archdate; + +####################################################### +# external programs +# +my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc"; +if (! -x $archprog) { + logger("ABORT", "TSM client program '$archprog' missing!!"); + exit 1; +} +my $checkprog = "/usr/local/mpiwg/archive/archivecheck"; +if (! -x $checkprog) { + logger("ABORT", "archive checking program '$checkprog' missing!!"); + exit 1; +} +# log file for archiver +my $log_file = "/var/tmp/unarchiver.log"; +if (! open LOG, ">>$log_file") { + logger("ABORT", "unable to write log file '$log_file'!!"); + exit 1; +} + +####################################################### +# check parameters that were passed to the program +# +my $docdir = $$args{'path'}; +if (! $docdir) { + print "ABORT: no document directory given!\n"; + exit 1; +} +# strip trailing slashes +$docdir =~ s/\/$//; +if (! -d $docdir) { + print "ABORT: document directory \'$docdir\' doesn't exist!\n"; + exit 1; +} + +my $metafile = "$docdir/index.meta"; +if (! -f $metafile) { + print "ABORT: metadata index file \'$metafile\' doesn't exist!\n"; + exit 1; +} + + +####################################################### +# internal variables +# + +# number of errors +my $errcnt = 0; +# number of warnings +my $warncnt = 0; + +####################################################### +# subroutines +# + + +# +# $files = read_resource_meta($rootnode) +# +# checks general resource meta information and reads the list of files +# +sub read_resource_meta { + my ($rootnode) = @_; + my %files; + # + # archive path + # + # get archive-path + $archname = sstrip($rootnode->findvalue('child::archive-path')); + if (! $archname) { + logger("ABORT", "archive-name element missing!!"); + exit 1; + } + + # + # files + # + my @filenodes = $rootnode->findnodes('child::file'); + foreach my $fn (@filenodes) { + my $name = sstrip($fn->findvalue('child::name')); + my $path = sstrip($fn->findvalue('child::path')); + logger("DEBUG", "FILE: ($path)$name"); + my $f = ($path) ? "$path/$name" : "$name"; + $files{$f} = [$name]; + } + + # + # dirs + # + my @dirnodes = $rootnode->findnodes('child::dir'); + foreach my $fn (@dirnodes) { + my $name = sstrip($fn->findvalue('child::name')); + my $path = sstrip($fn->findvalue('child::path')); + logger("DEBUG", "DIR: ($path)$name"); + my $f = ($path) ? 
"$path/$name" : "$name"; + $files{$f} = [$name]; + } + + # + # archive-storage-date + # + my $archdate = $rootnode->find('child::archive-storage-date'); + if ($archdate) { + logger("INFO", "archive storage date: $archdate"); + } else { + logger("ERROR", "archive storage date missing!"); + $errcnt++; + } + + # + # archive-recall-date + # + my $recalldatenode = ($rootnode->find('child::archive-recall-date'))->get_node(1); + if ($recalldatenode) { + print "INFO: archive recall date exists!\n"; + # delete old date + $recalldatenode->removeChildNodes; + } else { + # create new storage date node + $recalldatenode = $rootnode->addNewChild($namespace, "archive-recall-date"); + # move after archive-path + $rootnode->insertAfter($recalldatenode, ($rootnode->find('child::archive-storage-date'))->get_node(1)); + } + $recalldatenode->appendTextNode(scalar localtime); + $xml_changed++; + + return \%files; +} + + +# +# $%files = run_retrieve +# +# runs the retriever program on $docdir and returns a list of archived files +# +# Sample output: +# Retrieving 17,234 /mpiwg/archive/data/test/auto_titit_123/pageimg/essen-wind1.jpg [Done] +# +sub run_retrieve { + my %files; + print LOG "START unarchive $version ", scalar localtime, "\n"; + my $archcmd = $archprog; + $archcmd .= " retrieve -subdir=yes -replace=all"; + $archcmd .= " -description='$archname'"; + $archcmd .= " '$docdir/'"; + + my $archcnt = 0; + print LOG "CMD: $archcmd\n"; + if (open ARCH, "$archcmd 2>&1 |") { + while (<ARCH>) { + chomp; + print LOG "ARCH: $_\n"; + if (/ + Retrieving + \s+([\d,]+) # size + \s+(\S+) # file name + \s+\[Done\] + /x) { + my $size = $1; + my $file = $2; + $size =~ s/,//g; + logger("DEBUG", " RETRIEVE: file '$file'"); + $archcnt++; + if ($files{$file}) { + logger("WARNING", "file $file seems to be archived multiple times."); + $warncnt++; + } + $files{$file} = [$size]; + } + } + logger("INFO", "$archcnt archives of " . (scalar keys %files) . " files."); + } else { + logger("ABORT", "unable to start archive command '$archcmd'!!"); + exit 1; + } + return \%files; +} + + +# +# check_files(\%files_to_retrieve, \%retrieved_files) +# +# compares the list of archived and retrieved files +# +sub check_files { + my ($to_retrieve, $retrieved) = @_; + + my $nt = scalar keys %$to_retrieve; + my $na = scalar keys %$retrieved; + + foreach my $ft (sort keys %$to_retrieve) { + my $fp = "$docdir/$ft"; + #logger("DEBUG", " fp: $fp"); + if ($$retrieved{$fp}) { + logger("DEBUG", "$ft retrieved OK"); + $$retrieved{$fp}->[1] = "OK"; + } else { + logger("ERROR", "file entry '$ft' missing from archive!"); + $errcnt++; + } + } + + foreach my $fa (sort keys %$retrieved) { + if (! $$retrieved{$fa}->[1]) { + my ($fn, $fp) = split_file_path($fa); + if ($index_files{$fn}) { + logger("DEBUG", "$fa ignored"); + $na--; + } else { + logger("WARNING", "$fa retrieved but not in list!"); + $warncnt++; + } + } + } + + if ($nt > $na) { + logger("WARNING", "less files were retrieved ($na vs. $nt)!"); + $warncnt++; + } elsif ($na > $nt) { + logger("WARNING", "more files were retrieved ($na vs. 
$nt)!"); + $warncnt++; + } + +} + + + +####################################################### +# main +# + +logger("INFO", "unarchiver $version"); + +# make shure the right user is running this program +my $user = getlogin; +if (($user ne "archive")&&($user ne "root")) { + logger("ABORT", "you must be archive or root user to run this program!"); + exit 1; +} + +# use checkarchive first +if (system("$checkprog $docdir >/dev/null") == 0) { + logger("INFO", "archive '$docdir' check OK"); +} else { + logger("ABORT", "archive '$docdir' check failed!!"); + exit 1; +} + +# read index.meta file +my ($document, $rootnode) = read_xml($metafile); + +# check index file +my $archived_files = read_resource_meta($rootnode); +my $num_archived_files = scalar keys %$archived_files; + +# check for .archived file +if (-f "$docdir/.archived") { + logger("INFO", ".archived file exists."); +} else { + logger("WARNING", "no .archived file!"); + $warncnt++; +} + +logger("INFO", "$num_archived_files files to retrieve."); + +# retrieve +my $retrieved_files = run_retrieve; + +my $num_arch_files = (scalar keys %$retrieved_files); +if ($num_arch_files == 0) { + logger("ABORT", "no files retrieved!!"); + exit 1; +} +logger("INFO", "$num_arch_files files retrieved"); + +# check list of archived files +check_files($archived_files, $retrieved_files); + +# rewrite index.meta file +write_xml($document, $metafile); + +logger("INFO", "$warncnt warnings"); +logger("INFO", "$errcnt errors"); +if ($errcnt == 0) { + logger("DONE", "" . (scalar keys %$retrieved_files) . " archived files retrieved"); +} else { + logger("ABORT", "there were $errcnt errors!!"); + exit 1; +}
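Rough lifecycle sketch tying the scripts of this changeset together (installed names and paths follow the $checkprog values used above but are assumptions; the directories are placeholders):

use strict;

my $bin = "/usr/local/mpiwg/archive";
my $dir = "/mpiwg/archive/data/example/example_resource_01";

system("$bin/metacheck $dir")    == 0 or die "index.meta check failed\n";
system("$bin/archiver $dir")     == 0 or die "archiving failed\n";        # dsmc archive, then delete data files
system("$bin/archivecheck $dir") == 0 or die "archive verification failed\n";
# later, to bring the files back and record an archive-recall-date:
system("$bin/unarchiver $dir")   == 0 or die "retrieve failed\n";
# harvestmeta walks a whole tree of index.meta files into the storage database:
system("$bin/harvestmeta /mpiwg/archive/data") == 0 or die "harvest failed\n";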