changeset 0:30497c6a3eca

Initial revision
author casties
date Thu, 17 Jun 2004 17:58:42 +0200
parents
children 9143a0c4c678 ea3972762520
files HarvestmetaHandler.pm MPIWGStor.pm archivecheck.pl archiver.pl harvestmeta.pl metacheck.pl unarchiver.pl
diffstat 7 files changed, 2197 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/HarvestmetaHandler.pm	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,100 @@
+#
+# SAX handler for harvestmeta
+#
+
+package HarvestmetaHandler;
+
+use strict;
+
+use base qw(XML::SAX::Base);
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+my @currElemPath;
+my $currElem;
+my $currText;
+my $currAttrib;
+my @elements;
+  
+sub getData {
+    return @elements;
+}
+
+sub start_document {
+    my ($self, $doc) = @_;
+    # process document start event
+    #logger('DEBUG', "startdoc: $self, $doc");
+    @currElemPath = ();
+    $currElem = "";
+    $currText = "";
+    $currAttrib ="";
+    @elements = ();
+}
+
+sub start_element {
+    my ($self, $el) = @_;
+    # process element start event
+    #logger('DEBUG', "startelem: $self, $el");
+    # check if the last element needs to be finished
+    if ($currElem) {
+	my $elem = join "/", @currElemPath;
+	push @elements, [$elem, "", $currAttrib];
+    }
+    # element name is either LocalName or Name
+    my $name = $$el{'LocalName'};
+    $name = $$el{'Name'} unless ($name);
+    #logger('DEBUG', "  name: $name");
+    # assemble attributes string
+    $currAttrib ="";
+    foreach my $a (values %{$$el{'Attributes'}}) {
+	my $n = $$a{'LocalName'};
+	$n = $$a{'Name'} unless ($n);
+	my $v = $$a{'Value'};
+	$currAttrib .= "$n=\"$v\" ";
+    }
+    # start element name
+    push @currElemPath, $name;
+    $currElem = $name;
+    $currText = "";
+}
+
+sub end_element {
+    my ($self, $el) = @_;
+    # process element end event
+    #logger('DEBUG', "endelem: $self, $el");
+    # check element name
+    my $name = $$el{'LocalName'};
+    $name = $$el{'Name'} unless ($name);
+    my $lastag = $currElemPath[$#currElemPath];
+    if ($lastag ne $name) {
+	logger('ERROR', "closing tag '$lastag' doesn't match '$name'!");
+    }
+    # assemble element path
+    my $elem = join "/", @currElemPath;
+    # strip whitespace from element content
+    $currText =~ s/^\s*//;
+    $currText =~ s/\s*$//;
+    if ($currText) {
+	# put pair in elements array
+	push @elements, [$elem, $currText, $currAttrib];
+	#logger('DEBUG', "  elem: $elem = $currText");
+    }
+    # end element name
+    pop @currElemPath;
+    $currElem = "";
+    $currText = "";
+    $currAttrib ="";
+}
+
+sub characters {
+    my ($self, $char) = @_;
+    # process character data event
+    #logger('DEBUG', "characters: $self, $char");
+    # add to current content
+    $currText .= $$char{'Data'};
+    #logger('DEBUG', " Text: $currText");
+}
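+
+# Illustrative usage (a sketch; harvestmeta.pl in this changeset wires the
+# handler up the same way):
+#
+#   use XML::SAX;
+#   use HarvestmetaHandler;
+#
+#   my $handler = HarvestmetaHandler->new;
+#   my $parser  = XML::SAX::ParserFactory->parser(Handler => $handler);
+#   $parser->parse_uri("some_dir/index.meta");
+#   # each entry is [element-path, text-content, attribute-string]
+#   foreach my $e ($handler->getData()) {
+#       print "$$e[0] = $$e[1] ($$e[2])\n";
+#   }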
+
+
+1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MPIWGStor.pm	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,269 @@
+#!/usr/local/bin/perl -w
+
+#
+# Library with definitions and common routines for MPIWG storage system
+# scripts
+#
+
+package MPIWGStor;
+
+use strict;
+use vars qw(@ISA @EXPORT $VERSION
+	    $debug %junk_files %index_files $index_file_perm $archived_index_file_perm 
+	    $file_perm);
+use Exporter;
+use XML::LibXML;
+
+$VERSION = 0.3; #  ROC 24.9.2003
+
+@ISA = qw(Exporter);
+
+@EXPORT = qw($debug %junk_files %index_files $index_file_perm $archived_index_file_perm 
+	     $file_perm
+	     &parseargs &logger &stime  &s2stime &ymd_date &split_file_path &sstrip 
+	     &valid_file_name &valid_dir_name &read_xml &write_xml);
+
+# debug level
+$debug = 0;
+
+# junk files
+%junk_files = (
+		  '.HSResource' => '.HSResource',
+		  '.HSancillary' => '.HSancillary',
+		  '.HSicon' => '.HSicon',
+		  'Network Trash Folder' => 'Network Trash Folder',
+		  'TheVolumeSettingsFolder' => 'TheVolumeSettingsFolder',
+		  '.DS_Store' => '.DS_Store'
+		  );
+
+# filenames to not delete (but archive)
+%index_files = ( 
+		     'index.meta' => 'index.meta',
+		     'index.meta.old' => 'index.meta.old'
+		     );
+
+# default permissions for normal index files (rw-rw-r--)
+$index_file_perm = 0664;
+# default permissions for archived index files (rw-r--r--)
+$archived_index_file_perm = 0644;
+# default permissions for other files (rw-rw-r--)
+$file_perm = 0664;
+
+
+#
+# $param_hash = parseargs;
+#
+# reads @ARGV and returns a hash with all options like "-option=value"
+# and the last part of the string as "path"
+#
+sub parseargs {
+    my %opts;
+    foreach my $s (@ARGV) {
+	if ($s =~ /^-([^=]+)=*(.*)$/) {
+	    $opts{$1} = ($2) ? $2 : $1;
+	} else {
+	    $opts{'path'} = $s;
+	}
+    }
+    return \%opts;
+}
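+
+# Illustrative example (hypothetical command line):
+#   script.pl -debug=2 -checkonly /mpiwg/archive/data/some_resource
+# yields:
+#   { 'debug' => '2', 'checkonly' => 'checkonly', 'path' => '/mpiwg/archive/data/some_resource' }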
+
+
+#
+# logger($level, $message)
+#
+# logs the $message (mostly to stdout)
+#
+sub logger {
+    my ($level, $message) = @_;
+    if ($debug || ($level ne "DEBUG")) {
+	print "$level: $message\n";
+    }
+}
+
+#
+# $stime = stime($utime)
+#
+# format utime (seconds since epoch) into string 
+# representation: "YYYY/MM/DD HH:MM:SS"
+#
+sub stime {
+    my ($utime) = @_;
+    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
+	localtime($utime);
+    my $yy = $year + 1900;
+    my $mm = $mon + 1;
+    my $stime = sprintf("%d/%02d/%02d %02d:%02d:%02d", 
+			$yy, $mm, $mday, $hour, $min, $sec);
+    return $stime;
+}
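+
+# e.g. stime(time) returns something like "2004/06/17 17:58:42"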
+
+#
+# $stime = s2stime($stime2)
+#
+# format db-like time (2003-09-19 12:43:32+02) into string 
+# representation: "YYYY/MM/DD HH:MM:SS"
+#
+sub s2stime {
+    my ($s2time) = @_;
+    my $stime = "";
+    if ($s2time =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)/) {
+	my ($year,$mon,$mday,$hour,$min,$sec) = ($1, $2, $3, $4, $5, $6);
+	$stime = sprintf("%d/%02d/%02d %02d:%02d:%02d", 
+			$year, $mon, $mday, $hour, $min, $sec);
+    }
+    return $stime;
+}
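+
+# e.g. s2stime("2003-09-19 12:43:32+02") returns "2003/09/19 12:43:32"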
+
+#
+# $date = ymd_date($olddate)
+#
+# converts a date string in mm/dd/yy format to yyyy/mm/dd format
+#
+sub ymd_date {
+    my ($mdy) = @_;
+    my $ydm;
+    if ($mdy =~ /(\d+)\/(\d+)\/(\d+)/) {
+	my $m = $1;
+	my $d = $2;
+	my $y = $3;
+	# extend yy to yyyy
+	$y = ($y < 70) ? (2000 + $y) : (($y < 100) ? ($y + 1900) : $y);
+	$ydm = sprintf "%d/%02d/%02d", $y, $m, $d;
+    }
+    return $ydm;
+}
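+
+# e.g. ymd_date("09/24/03") returns "2003/09/24";
+# two-digit years below 70 are mapped to 20xx, others to 19xx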
+
+#
+# ($name, $path) = split_file_path($fn)
+#
+# splits the file path $fn into
+# file name (the last path element) and path
+#
+sub split_file_path {
+    my ($fn) = @_;
+
+    if ($fn =~ /^(.*)\/([^\/]+)$/) {
+        return ($2, $1);
+    }
+    return $fn;
+}
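+
+# e.g. split_file_path("pageimg/essen-wind1.jpg") returns ("essen-wind1.jpg", "pageimg");
+# a name without a slash is returned unchanged (no path element)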
+
+
+#
+# $name = sstrip($name)
+#
+# strips leading and trailing whitespace from $name
+#
+sub sstrip {
+    my ($name) = @_;
+    
+    if ($name =~ /^\s*(.*?)\s*$/) {
+	return $1;
+    }
+    return $name;
+}
+
+
+#
+# $valid = valid_file_name($name)
+#
+# checks if $name is a valid file name
+#
+sub valid_file_name {
+    return valid_name($_[0], 0);
+}
+
+
+#
+# $valid = valid_dir_name($name)
+#
+# checks if $name is a valid directory name
+#
+sub valid_dir_name {
+    return valid_name($_[0], 1);
+}
+
+
+#
+# $valid = valid_name($name, $mode)
+#
+# checks if $name is a valid file ($mode=0) or directory name (mode=1)
+#
+sub valid_name {
+    my ($name, $mode) = @_;
+
+    # whitespace?
+    if ($name =~ /\s+/) {
+	return 0;
+    }
+    # invalid characters
+    if ($name !~ /^[-\w.\/]+$/) {
+	return 0;
+    }
+    # files have extension
+    if ($mode == 0) {
+	if ($name !~ /\.[-\w]+$/) {
+	    return 0;
+	}
+    }
+    return 1;
+}
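+
+# e.g. valid_file_name("page0001.jpg") is true; valid_file_name("page 1.jpg")
+# and valid_file_name("README") are false (whitespace / missing extension);
+# valid_dir_name("pageimg") is true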
+
+
+#
+# ($document, $rootnode) = read_xml($file)
+#
+# reads xml meta file $file
+# returns root node element
+#
+sub read_xml {
+    my ($file) = @_;
+    my $document;
+
+    my $parser = XML::LibXML->new();
+    if (! $parser) {
+	logger("ABORT", "unable to create parser for metadata index file!!");
+	exit 1;
+    }
+
+    logger("INFO", "index file: $file");
+    eval { $document = $parser->parse_file($file) };
+    # catch parsing errors
+    if ($@) {
+	logger("ABORT", "XML syntax error in index file!!");
+	exit 1;
+    }
+    my $rootnode = $document->documentElement;
+    logger("INFO", "root element: \'" . $rootnode->nodeName . "\'");
+
+    return ($document, $rootnode);
+}
+
+
+#
+# write_xml($document, $file)
+#
+# saves the XML tree of $document to the file $file
+#
+sub write_xml {
+    my ($document, $filename) = @_;
+    # keep backup copy
+    if (-f $filename) {
+	if (! rename $filename, "$filename.old") {
+	    logger("ABORT", "unable to rename file $filename to $filename.old!");
+	    exit 1;
+	}
+    }
+    # write new file
+    if ($document->toFile($filename)) {
+	logger("INFO", "written new file $filename");
+	chmod $index_file_perm, $filename;
+    } else {
+	logger("ABORT", "unable to write file $filename!");
+	exit 1;
+    }
+}
+
+
+# module init
+return 1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/archivecheck.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,391 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+
+use XML::LibXML;
+
+# MPIWG libraries
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.3 (24.9.2003)";
+
+# read command line parameters
+my $args = parseargs;
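+# example invocation (illustrative):
+#   archivecheck.pl -debug=1 /mpiwg/archive/data/some_resource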
+
+# debug level
+$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
+
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+# archive name (archive-path element, usually == $docdir)
+my $archname;
+# archive storage date
+my $archdate;
+
+
+#######################################################
+# external programs
+#
+my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc";
+if (! -x $archprog) {
+    logger("ABORT", "TSM client program '$archprog' missing!!");
+    exit 1;
+}
+# my $checkprog = "/usr/local/mpiwg/archive/metacheck";
+# if (! -x $checkprog) {
+#     logger("ABORT", "meta data checking program '$checkprog' missing!!");
+#     exit 1;
+# }
+# log file for archiver
+my $log_file = "/var/tmp/archivecheck.log";
+if (! open LOG, ">>$log_file") {
+    logger("ABORT", "unable to write log file '$log_file'!!");
+    exit 1;
+}
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $docdir = $$args{'path'};
+if (! $docdir) {
+    print "ABORT: no document directory given!\n";
+    exit 1;
+}
+# strip trailing slashes
+$docdir =~ s/\/$//;
+if (! -d $docdir) {
+    print "ABORT: document directory \'$docdir\' doesn't exist!\n";
+    exit 1;
+}
+
+my $metafile = "$docdir/index.meta";
+if (! -f $metafile) {
+    print "ABORT: metadata index file \'$metafile\' doesn't exist!\n";
+    exit 1;
+}
+
+#######################################################
+# internal variables
+#
+
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+#######################################################
+# subroutines
+#
+
+
+#
+# $files = read_resource_meta($rootnode)
+#
+# checks general resource meta information and reads the list of files
+#
+sub read_resource_meta {
+    my ($rootnode) = @_;
+    my %files;
+    #
+    # archive path
+    #
+    # get archive-path
+    $archname = sstrip($rootnode->findvalue('child::archive-path'));
+    if (! $archname) {
+	logger("ABORT", "archive-path element missing!!");
+	exit 1;
+    }
+
+    #
+    # files
+    #
+    my @filenodes = $rootnode->findnodes('child::file');
+    foreach my $fn (@filenodes) {
+	my $name = sstrip($fn->findvalue('child::name'));
+	my $path = sstrip($fn->findvalue('child::path'));
+	logger("DEBUG", "FILE: ($path)$name");
+	my $f = ($path) ? "$path/$name" : "$name";
+	$files{$f} = [$name];
+    }
+
+    #
+    # dirs
+    #
+    my @dirnodes = $rootnode->findnodes('child::dir');
+    foreach my $fn (@dirnodes) {
+	my $name = sstrip($fn->findvalue('child::name'));
+	my $path = sstrip($fn->findvalue('child::path'));
+	logger("DEBUG", "DIR: ($path)$name");
+	my $f = ($path) ? "$path/$name" : "$name";
+	$files{$f} = [$name];
+    }
+
+    #
+    # archive-storage-date
+    #
+    my $archdate = $rootnode->find('child::archive-storage-date');
+    if ($archdate) {
+	logger("INFO", "archive storage date: $archdate");
+    } else {
+	logger("ERROR", "archive storage date missing!");
+	$errcnt++;
+    }
+    return \%files;
+}
+
+
+#
+# fs_read_files($realdir, $docdir, \%files, \%dirs)
+#
+# reads all files and directories below $realdir and puts the
+# files in %files and directories in %dirs
+# $docdir is only for recursion, it should be empty when called 
+# from outside
+#
+sub fs_read_files {
+    my ($directory, $docdir, $files, $dirs) = @_;    
+    my $cnt = 0;
+
+    if (! opendir DIR, $directory) {
+	return 0;
+    }
+    my @dirfiles = readdir DIR;
+    foreach my $fn (@dirfiles) {
+	# ignore names starting with a dot
+	next if ($fn =~ /^\./);
+	# ignore other silly files
+	next if ($junk_files{$fn});
+
+	$cnt++;
+	my $f = "$directory/$fn";
+	my $docf = ($docdir) ? "$docdir/$fn" : $fn;
+	#print "fs_file: \"$f\"\n";
+	if (-f $f) {
+	    #print "  is file\n";
+	    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
+                      $atime,$mtime,$ctime,$blksize,$blocks)
+                          = stat(_); 
+	    $$files{$docf} = [$fn, $size, stime($mtime)];
+	    #logger("TEST", "fn $fn, size $size, mtime $mtime");
+	} elsif (-d _) {
+	    #print "  is dir\n";
+	    $$dirs{$docf} = $fn;
+	    # recurse into directory
+	    $cnt += fs_read_files($f, $docf, $files, $dirs);
+	}
+    }
+    return $cnt;
+}
+
+
+#
+# $%files = run_query
+#
+# runs the archiver program on $docdir and returns a list of archived files
+#
+# Sample output:
+#         20,345  B  08/06/03   17:17:02    /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839/index.meta Never /mpiwg/archive/data/proyectohumboldt/webb_histo_fr_01_1839
+#
+sub run_query {
+    my %files;
+    print LOG "START checkarchive $version ", scalar localtime, "\n";
+    my $archcmd = $archprog;
+    $archcmd .= " query archive -subdir=yes";
+    $archcmd .= " -description='$archname'";
+    $archcmd .= " '$docdir/'";
+
+    my $archcnt = 0;
+    print LOG "CMD: $archcmd\n";
+    if (open ARCH, "$archcmd 2>&1 |") {
+	while (<ARCH>) {
+	    chomp;
+	    print LOG "ARCH: $_\n";
+	    if (/
+		\s*([\d,]+)    # size
+		\s+(\w+)       # unit of size
+		\s+([\d\/]+)   # date mm\/dd\/yy
+		\s+([\d:]+)    # time
+		\s+(\S+)       # file name
+		\s+(\w+)       # expiry
+		\s+(\S+)       # archive label
+		/x) {
+		my $size = $1;
+		my $sunit = $2;
+		my $date = $3;
+		my $time = $4;
+		my $file = $5;
+		my $exp = $6;
+		my $label = $7;
+		$size =~ s/,//g;
+		$date = ymd_date($date);
+		logger("DEBUG", "  QUERY: file '$file'");
+		$archcnt++;
+		if ($files{$file}) {
+		    logger("WARNING", "file $file seems to be archived multiple times: $time $date");
+		    $warncnt++;
+		} 
+		$files{$file} = [$size, "$date $time"];
+	    }
+	}
+	logger("INFO", "$archcnt archives of " . (scalar keys %files) . " files.");
+    } else {
+	logger("ABORT", "unable to start archive command '$archcmd'!!");
+	exit 1;
+    }
+
+    return \%files;
+}
+
+
+#
+# check_files(\%files_to_archive, \%archived_files)
+#
+# compares the list of archived and to be archived files
+#
+sub check_files {
+    my ($to_archive, $archived) = @_;
+
+    my $nt = scalar keys %$to_archive;
+    my $na = scalar keys %$archived;
+
+    foreach my $ft (sort keys %$to_archive) {
+	my $fp = "$docdir/$ft";
+	#logger("DEBUG", "  fp: $fp");
+	if ($$archived{$fp}) {
+	    logger("DEBUG", "$ft archived OK");
+	    $$archived{$fp}->[2] = "OK";
+	} else {
+	    logger("ERROR", "file entry '$ft' missing from archive!");
+	    $errcnt++;
+	}
+    }
+
+    foreach my $fa (sort keys %$archived) {
+	if (! $$archived{$fa}->[2]) {
+	    my ($fn, $fp) = split_file_path($fa);
+	    if ($index_files{$fn}) {
+		logger("DEBUG", "$fa ignored");
+		$na--;
+	    } else {
+		logger("WARNING", "$fa archived but not in list!");
+		$warncnt++;
+	    }
+	}
+    }
+
+    if ($nt > $na) {
+	logger("WARNING", "fewer files were archived ($na vs. $nt)!");
+	$warncnt++;
+    } elsif ($na > $nt) {
+	logger("WARNING", "more files were archived ($na vs. $nt)!");
+	$warncnt++;
+    }
+
+}
+
+#
+# compare_files(\%files_on_disk, \%archived_files)
+#
+# compares the list of archived files and files on disk
+#
+sub compare_files {
+    my ($fs_files, $archived) = @_;
+
+    foreach my $ft (sort keys %$fs_files) {
+	next if ($index_files{$ft});
+	my $fp = "$docdir/$ft";
+	#logger("DEBUG", "  fp: $fp");
+	if ($$archived{$fp}) {
+	    next if ($index_files{$ft});
+	    
+	    my $asize = $$archived{$fp}[0];
+	    my $atime = $$archived{$fp}[1];
+	    my $fsize = $$fs_files{$ft}[1];
+	    my $ftime = $$fs_files{$ft}[2];
+	    if ($asize != $fsize) {
+		logger("ERROR", "archived $ft ($asize) and file on disk ($fsize) have different size!");
+		$errcnt++;
+	    } elsif ($atime lt $ftime) {
+		logger("ERROR", "archived $ft ($atime) is older than file on disk ($ftime)!");
+		$errcnt++;
+	    } else {
+		logger("ERROR", "archived file $ft still on disk");
+		$errcnt++;
+	    }
+	} else {
+	    logger("ERROR", "file '$ft' on disk missing from archive!");
+	    $errcnt++;
+	}
+    }
+}
+
+
+
+#######################################################
+# main
+#
+
+logger("INFO", "archivecheck $version");
+
+# make sure the right user is running this program
+my $user = getlogin;
+if (($user ne "archive")&&($user ne "root")) {
+    logger("ABORT", "you must be archive or root user to run this program!");
+    exit 1;
+}
+
+# read index.meta file
+my ($document, $rootnode) = read_xml($metafile);
+
+# check file and add archive date
+my $files_to_archive = read_resource_meta($rootnode);
+
+# check for .archived file
+if (-f "$docdir/.archived") {
+    logger("INFO", ".archived file exists.");
+} else {
+    logger("WARNING", "no .archived file!");
+    $warncnt++;
+}
+
+# check archive
+my $archived_files = run_query;
+
+my $num_arch_files = (scalar keys %$archived_files);
+if ($num_arch_files == 0) {
+    logger("ABORT", "no archive of this directory!!");
+    exit 1;
+}
+logger("INFO", "$num_arch_files files archived");
+
+# check list of archived files
+check_files($files_to_archive, $archived_files);
+
+# read files from filesystem
+my %fsfiles;
+my %fsdirs;
+my $num_fs_files = fs_read_files($docdir, "", \%fsfiles, \%fsdirs);
+
+logger("INFO", "$num_fs_files files still on disk!");
+if ($num_fs_files > 0) {
+    compare_files(\%fsfiles, $archived_files);
+}
+
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt == 0) {
+    logger("DONE", "" . (scalar keys %$archived_files) . " archived files checked");
+} else {
+    logger("ABORT", "there were $errcnt errors!!");
+    exit 1;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/archiver.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,407 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+
+use XML::LibXML;
+
+# MPIWG libraries
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.5 (15.1.2004)";
+
+# rewrite XML file (necessary for archive date!)
+my $fix_xml = 1;
+my $xml_changed = 0;
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+# archive name (archive-path element, usually == $docdir)
+my $archname;
+# archive storage date (now)
+my $archdate = stime(time);
+
+# delete "junk" files before archiving
+my $delete_junk_files = 1;
+
+# delete data files after archiving
+my $delete_data_files = 1;
+
+
+#######################################################
+# external programs
+#
+my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc";
+if (! -x $archprog) {
+    print "ABORT: TSM client program '$archprog' missing!!\n";
+    exit 1;
+}
+my $checkprog = "/usr/local/mpiwg/archive/metacheck";
+if (! -x $checkprog) {
+    print "ABORT: meta data checking program '$checkprog' missing!!\n";
+    exit 1;
+}
+# log file for archiver
+my $log_file = "/var/log/mpiwg-archiver.log";
+if (! open LOG, ">>$log_file") {
+    print "ABORT: unable to write log file '$log_file'!!\n";
+    exit 1;
+}
+
+#######################################################
+# check parameters that were passed to the program
+#
+if ($#ARGV < 0) {
+    print "ABORT: no document directory given!\n";
+    exit 1;
+}
+my $docdir = $ARGV[0];
+# strip trailing slashes
+$docdir =~ s/\/$//;
+if (! -d $docdir) {
+    print "ABORT: document directory \'$docdir\' doesn't exist!\n";
+    exit 1;
+}
+if (($#ARGV > 0)&&($ARGV[1] eq "-premigrate")) {
+    $delete_data_files = 0;
+}
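+# example invocation (illustrative):
+#   archiver.pl /mpiwg/archive/data/some_resource [-premigrate]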
+
+my $metafile = "$docdir/index.meta";
+if (! -f $metafile) {
+    print "ABORT: metadata index file \'$metafile\' doesn't exist!\n";
+    exit 1;
+}
+
+#######################################################
+# internal variables
+#
+
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+#######################################################
+# subroutines
+#
+
+#
+# $files = read_resource_meta($rootnode)
+#
+# checks general resource meta information and reads the list of files
+#
+sub read_resource_meta {
+    my ($rootnode) = @_;
+    my %files;
+    #
+    # archive path
+    #
+    # get archive-path
+    $archname = MPIWGStor::sstrip($rootnode->findvalue('child::archive-path'));
+    if (! $archname) {
+	print "ABORT: archive-path element missing!!\n";
+	exit 1;
+    }
+
+    #
+    # files
+    #
+    my @filenodes = $rootnode->findnodes('child::file');
+    foreach my $fn (@filenodes) {
+	my $name = MPIWGStor::sstrip($fn->findvalue('child::name'));
+	my $path = MPIWGStor::sstrip($fn->findvalue('child::path'));
+	print "FILE: ($path)$name\n";
+	my $f = ($path) ? "$path/$name" : "$name";
+	$files{$f} = $name;
+    }
+
+    #
+    # archive-storage-date
+    #
+    my $stordatenode = ($rootnode->find('child::archive-storage-date'))->get_node(1);
+    if ($stordatenode) {
+	print "WARNING: archive storage date exists! Resource already archived?\n";
+	$warncnt++;
+	# delete old date
+	$stordatenode->removeChildNodes;
+    } else {
+	# create new storage date node
+	$stordatenode = $rootnode->addNewChild($namespace, "archive-storage-date");
+	# move after archive-path
+	$rootnode->insertAfter($stordatenode, ($rootnode->find('child::archive-path'))->get_node(1));
+    }
+    $stordatenode->appendTextNode($archdate);
+    $xml_changed++;
+    return \%files;
+}
+
+
+#
+# $%files = run_archive
+#
+# runs the archiver program on $docdir and returns a list of archived files
+#
+sub run_archive {
+    my %files;
+    print LOG "START archiver $version $archdate\n";
+    my $archcmd = $archprog;
+    $archcmd .= " archive -archsymlinkasfile=no -subdir=yes";
+    $archcmd .= " -description='$archname'";
+    $archcmd .= " '$docdir/'";
+
+    print LOG "CMD: $archcmd\n";
+    if (open ARCH, "$archcmd 2>&1 |") {
+	while (<ARCH>) {
+	    chomp;
+	    print LOG "ARCH: $_\n";
+	    if (/Normal File-->\s+[\d,]+\s+(.*)\s+\[Sent\]/) {
+		print "  ARCH: file '$1'\n";
+		$files{$1} = "ok";
+	    }
+	    if (/^Archive processing of .* finished without failure./) {
+		print "  ARCH: OK\n";
+	    }
+	}
+    } else {
+	print "ABORT: unable to start archive command '$archcmd'!!\n";
+	exit 1;
+    }
+
+    return \%files;
+}
+
+
+#
+# check_files(\%files_to_archive, \%archived_files)
+#
+# compares the list of archived and to be archived files
+#
+sub check_files {
+    my ($to_archive, $archived) = @_;
+
+    my $nt = scalar keys %$to_archive;
+    my $na = scalar keys %$archived;
+
+    foreach my $ft (sort keys %$to_archive) {
+	my $fp = "$docdir/$ft";
+	#print "  fp: $fp\n";
+	if ($$archived{$fp}) {
+	    print "DEBUG: $ft archived OK\n";
+	    $$archived{$fp} = "OK";
+	} else {
+	    print "ERROR: file '$ft' missing from archive!\n";
+	    $errcnt++;
+	}
+    }
+
+    foreach my $fa (sort keys %$archived) {
+	if ($$archived{$fa} ne "OK") {
+	    my ($fn, $fp) = MPIWGStor::split_file_path($fa);
+	    if ($MPIWGStor::index_files{$fn}) {
+		print "DEBUG: $fa ignored\n";
+		$na--;
+	    } else {
+		print "WARNING: $fa archived but not in list!\n";
+		$warncnt++;
+	    }
+	}
+    }
+
+    if ($nt > $na) {
+	print "WARNING: fewer files were archived ($na vs. $nt)!\n";
+    } elsif ($na > $nt) {
+	print "WARNING: more files were archived ($na vs. $nt)!\n";
+    }
+
+}
+
+
+#
+# delete_files(\%files)
+#
+# deletes the files from the list (of absolute files) and their directories
+# if they are empty
+#
+sub delete_files {
+    my ($files) = @_;
+    my %dirs;
+
+    foreach my $f (sort keys %$files) {
+	my ($fn, $fp) = MPIWGStor::split_file_path($f);
+	# collect all unique directories
+        if ($fp && (! $dirs{$fp})) {
+	    $dirs{$fp} = $fp;
+	}
+	# don't delete index files
+	next if ($MPIWGStor::index_files{$fn});
+	# no file no delete
+	next unless (-f $f);
+	# delete files
+	if (unlink $f) {
+	    print "INFO: remove $f ($fn)\n";
+	} else {
+	    print "ERROR: unable to delete $f!\n";
+	    $errcnt++;
+	}
+    }
+    # try to delete all empty directories
+    my @dirkeys = sort keys %dirs;
+    # starting at the end to get to the subdirectories first
+    for (my $i = $#dirkeys; $i >= 0; $i--) {
+	my $d = $dirkeys[$i];
+	# don't remove document dir (shouldn't be empty anyway)
+	next if ($d eq $docdir);
+	if (-d $d) {
+	    print "INFO: remove dir $d\n";
+	    rmdir $d;
+	}
+    }
+}
+
+
+#
+# delete_all_files(\%files, $dir)
+#
+# deletes all files with names from the list %files
+# in the directory $dir and its subdirectories 
+#
+sub delete_all_files {
+    my ($files, $dir) = @_;
+
+    if (! opendir DIR, $dir) {
+	print "ERROR: unable to read directory $dir!\n";
+	$errcnt++;
+	return;
+    }
+    my @fl = readdir DIR;
+    closedir DIR;
+
+    foreach my $f (@fl) {
+	next if ($f =~ /^\.{1,2}$/);
+	if ($$files{$f}) {
+	    # $f is in the file list
+	    if (-f "$dir/$f") {
+		# $f is a file
+		if (unlink "$dir/$f") {
+		    print "INFO: removed $f\n";
+		} else {
+		    print "ERROR: unable to delete $f!\n";
+		    $errcnt++;
+		}
+	    } elsif (-d _) {
+		# $f is a directory (unlink won't work)
+		if ((system 'rm', '-r', "$dir/$f") == 0) {
+		    print "INFO: removed directory $f\n";
+		} else {
+		    print "ERROR: unable to delete directory $f!\n";
+		    $errcnt++;
+		}
+	    } else {
+		print "ERROR: funny object $dir/$f!\n";
+		$errcnt++;
+	    }
+	} else {
+	    # $f is not in the list
+	    if (-d "$dir/$f") {
+		# recurse into directories
+		print "DEBUG: enter $dir/$f\n";
+		delete_all_files($files, "$dir/$f");
+	    }
+	}
+    }
+}
+
+
+#######################################################
+# main
+#
+
+print "START: archiver $version at $archdate\n";
+
+# make sure the right user is running this program
+my $user = getlogin;
+#if (($user ne "archive")&&($user ne "root")) {
+#    logger("ABORT", "you ($user) must be archive or root user to run this program!");
+#    exit 1;
+#}
+
+# use metacheck first
+if (system("$checkprog $docdir >/dev/null") == 0) {
+    print "INFO: resource '$docdir' check OK\n";
+} else {
+    print "ABORT: resource '$docdir' check failed!!\n";
+    exit 1;
+}
+
+# read index.meta file
+my ($document, $rootnode) = MPIWGStor::read_xml($metafile);
+
+# check file and add archive date
+my $files_to_archive = read_resource_meta($rootnode);
+
+print "INFO: ", scalar keys %$files_to_archive, " files to archive\n";
+
+# check for .archived file
+if (-f "$docdir/.archived") {
+    if (unlink "$docdir/.archived") {
+	print "WARNING: existing .archived file has been removed! Resource already archived?\n";
+	$warncnt++;
+    } else {
+	print "ERROR: unable to remove existing .archived file!\n";
+	$errcnt++;
+    }
+}
+
+# remove junk files
+if ($delete_junk_files) {
+    delete_all_files(\%MPIWGStor::junk_files, $docdir);
+}
+
+# write new index.meta
+if ($errcnt > 0) {
+    print "ABORT: there were errors!\n";
+    exit 1;
+} else {
+    if ($fix_xml) {
+	MPIWGStor::write_xml($document, $metafile);
+    }
+}
+
+# start archiving
+my $archived_files = run_archive;
+
+print "INFO: ", scalar keys %$archived_files, " files archived\n";
+
+# check list of archived files
+check_files($files_to_archive, $archived_files);
+
+# delete files if all went OK
+if ($errcnt == 0) {
+    system("touch", "$docdir/.archived");
+    # remove junk files (again)
+    if ($delete_junk_files) {
+	delete_all_files(\%MPIWGStor::junk_files, $docdir);
+    }
+    # remove archived files
+    if ($delete_data_files) {
+	delete_files($archived_files);
+    }
+}
+
+print "INFO: $warncnt warnings\n";
+print "INFO: $errcnt errors\n";
+my $num_archfiles = scalar keys %$archived_files;
+if ($errcnt > 0) {
+    print "ABORT: there were errors! ($num_archfiles files archived) at ", stime(time), "\n";
+    exit 1;
+} else {
+    print "DONE: $num_archfiles files archived at ", stime(time), "\n";
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/harvestmeta.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,289 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use XML::SAX;
+use DBI;
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+use HarvestmetaHandler;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.1 (08.06.2004)";
+
+# read command line parameters
+my $args = MPIWGStor::parseargs;
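+# example invocation (illustrative):
+#   harvestmeta.pl -debug=1 -purgedb /mpiwg/archive/data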
+
+# debug level
+$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
+
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+# delete and rebuild database
+my $purgeDB = (exists $$args{'purgedb'});
+
+# database connection
+my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
+if (! $dbh) {
+    logger('ABORT', "unable to connect to database!");
+    exit 1;
+}
+$dbh->{AutoCommit} = 0;
+my $dbNextFileId;
+my $dbNewFile;
+my $dbNewMeta;
+my $dbClearMeta;
+my $dbFindFileName;
+my $dbFindFilePath;
+my $dbClearFile;
+my $dbFindFileFlag;
+my $dbFindFileFlagPath;
+my $dbSetFileFlag;
+my $dbClearAllFileFlag;
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $basedir = $$args{'path'};
+if (! $basedir) {
+    logger("ABORT", "no document directory given!");
+    exit 1;
+}
+# strip trailing slashes
+$basedir =~ s/\/$//;
+if (! -d $basedir) {
+    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
+    exit 1;
+}
+
+my $metaParserHandler = HarvestmetaHandler->new;
+my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
+
+#######################################################
+# internal variables
+#
+
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+# number of files on fs
+my $fcnt = 0;
+# number of index files
+my $idxcnt = 0;
+
+#######################################################
+# subroutines
+#
+
+#
+# readAllFiles($realdir, $basedir)
+#
+# recursively reads all files and directories below $realdir and harvests
+# every index.meta file it finds
+# $basedir is only for recursion, it should be empty when called
+# from outside
+#
+sub readAllFiles {
+    my ($directory, $basedir) = @_;    
+    my $cnt = 0;
+
+    if (! opendir DIR, $directory) {
+	return 0;
+    }
+    my @dirfiles = readdir DIR;
+    foreach my $fn (@dirfiles) {
+	# ignore names starting with a dot
+	next if ($fn =~ /^\./);
+	# ignore other silly files
+	next if ($junk_files{$fn});
+
+	$cnt++;
+	$fcnt++;
+	my $f = "$directory/$fn";
+	my $docf = ($basedir) ? "$basedir/$fn" : $fn;
+	#logger('DEBUG', "fs_file: \"$f\"");
+	if (-f $f) {
+	    #logger("  is file");
+	    if ($fn eq "index.meta") {
+		harvestFile($fn, $directory);
+	    }
+	} elsif (-d _) {
+	    #logger("  is dir");
+	    # recurse into directory
+	    $cnt += readAllFiles($f, $docf);
+	}
+    }
+    return $cnt;
+}
+
+#
+# cleanUnmarkedFiles($basepath)
+#
+# deletes all unflagged file and meta entries.
+#
+sub cleanUnmarkedFiles {
+    my ($basepath) = @_;
+    my $rv = $dbFindFileFlagPath->execute("${basepath}%");
+    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
+    for my $i (@$ids) {
+	my $id = $$i[0];
+	logger('DEBUG', "cleaning file and meta of id: $id");
+	$dbClearMeta->execute($id);
+	$dbClearFile->execute($id);
+	$dbh->commit;
+    }
+}
+
+#
+# harvestFile($filename, $filepath)
+#
+# reads the index file $filename at $filepath and puts the contents
+# in the database.
+#
+sub harvestFile {
+    my ($filename, $filepath) = @_;
+    logger('DEBUG', "looking at file '$filename' at '$filepath'");
+    # get file time
+    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
+	$atime,$mtime,$ctime,$blksize,$blocks)
+	= stat("$filepath/$filename");
+    my $filetime = stime($mtime);
+    # register file in db
+    my $fid = registerFile("$filepath/$filename", $filetime);
+    if ($fid) {
+	# file is new/modified
+	# parse index file
+	$metaParser->parse_uri("$filepath/$filename");
+	my @data = $metaParserHandler->getData();
+	logger('DEBUG', "parsed " . scalar(@data) . " elements");
+	registerMeta($fid, @data);
+    }
+    $idxcnt++;
+    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
+}
+
+#
+# $fileid = registerFile($filepath, $filetime)
+#
+# returns the file ID for the file $filepath. If necessary it
+# will be added to the database. returns 0 if an update is not necessary.
+#
+sub registerFile {
+    my ($filepath, $filetime) = @_;
+    my $fileid = 0;
+    # look if file is in db
+    my $rv = $dbFindFileName->execute($filepath);
+    my $mtime;
+    ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
+    if ($fileid) {
+	# file is in db
+	# update flag
+	$dbSetFileFlag->execute($fileid, 1);
+	$dbh->commit;
+	my $stime = s2stime($mtime);
+	if ($stime ge $filetime) {
+	    # if its current return 0
+	    logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
+	    return 0;
+	} else {
+	    logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
+	}
+    }
+    if (! $fileid) {
+	# get a new file id
+	my $rv = $dbNextFileId->execute;
+	($fileid) = $dbNextFileId->fetchrow_array;
+	logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
+	$dbNewFile->execute($fileid, $filepath, $filetime);
+	# update flag
+	$dbSetFileFlag->execute($fileid, 1);
+	$dbh->commit;
+    }
+    return $fileid;
+}
+
+#
+# registerMeta($fileid, @meta)
+#
+# adds the metadata information @meta for $fileid to the database.
+#
+sub registerMeta {
+    my ($fileid, @meta) = @_;
+    logger('DEBUG', "DB newmeta: fileid=$fileid (" . scalar(@meta) . " entries)");
+    my $idx = 0;
+    foreach my $keyval (@meta) {
+	#logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
+	$dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
+    }
+    $dbh->commit;
+    logger('INFO', "added $idx elements (file $fileid)");
+}
+
+#
+# initdb()
+#
+# initialises the database connection.
+#
+sub initDB {
+    my $rv;
+    # clean tables
+    if ($purgeDB) {
+	$rv = $dbh->do("delete from files");
+	$rv = $dbh->do("delete from meta");
+	if ($dbh->err) {
+	    logger('ABORT', "unable to clean table!");
+	    exit 1;
+	}
+	$dbh->commit;
+    }
+
+    # per-session table for flagging files seen in this run
+    $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
+    $dbh->commit;
+
+    # prepare statements
+    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
+    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
+    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
+    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
+    $dbClearFile = $dbh->prepare("delete from files where id=?");
+    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
+    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
+    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
+    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
+    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
+
+}
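+
+# The prepared statements above assume roughly the following PostgreSQL
+# schema (an illustrative sketch, not part of this changeset; column types
+# are guesses). The unused $dbFindFilePath statement additionally expects
+# a flag column on files.
+#
+#   create sequence files_id_seq;
+#   create table files (
+#       id       integer primary key,
+#       filename text,
+#       mtime    timestamp with time zone
+#   );
+#   create table meta (
+#       fileid     integer references files(id),
+#       idx        integer,
+#       tags       text,
+#       attributes text,
+#       content    text
+#   );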
+
+#######################################################
+# main
+#
+
+logger("INFO", "harvestmeta $version");
+ 
+initDB();
+
+# read and process all files under $basedir
+my $fnum = readAllFiles($basedir, "");
+# delete orphaned data (under $basedir)
+cleanUnmarkedFiles($basedir);
+
+logger("INFO", "analysed $idxcnt of $fnum files!");
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt > 0) {
+    logger("ABORT", "there were errors!");
+    exit 1;
+} else {
+    logger("DONE", "all index files read successfully!");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metacheck.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,414 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use XML::LibXML;
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.5.2 (7.1.2004)";
+
+# read command line parameters
+my $args = MPIWGStor::parseargs;
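+# example invocation (illustrative):
+#   metacheck.pl -debug=1 -checkonly /mpiwg/archive/data/some_resource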
+
+# debug level
+$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
+
+# check only or fix index file also
+my $check_only = (exists $$args{'checkonly'}) ? $$args{'checkonly'} : 0;
+# add file tags for missing files
+my $fix_files = ! $check_only;
+# add tags for file size and date
+my $fix_fs_meta = 1;
+# add dir tags for missing directories
+my $fix_dirs = ! $check_only;
+# rewrite XML file (necessary for fix_files and fix_dirs)
+my $fix_xml = ! $check_only;
+my $xml_changed = 0;
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $docdir = $$args{'path'};
+if (! $docdir) {
+    logger("ABORT", "no document directory given!");
+    exit 1;
+}
+# strip trailing slashes
+$docdir =~ s/\/$//;
+if (! -d $docdir) {
+    logger("ABORT", "document directory \'$docdir\' doesn't exist!");
+    exit 1;
+}
+
+my $metafile = "$docdir/index.meta";
+if (! -f $metafile) {
+    logger("ABORT", "metadata index file \'$metafile\' doesn't exist!");
+    exit 1;
+}
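+
+# For reference, an index.meta file roughly as the checks below expect it
+# (an illustrative sketch assembled from the elements this script reads;
+# the root element name itself is not checked):
+#
+#   <resource>
+#     <name>some_resource</name>
+#     <description>scanned images of ...</description>
+#     <archive-path>/mpiwg/archive/data/some_resource</archive-path>
+#     <dir>
+#       <name>pageimg</name>
+#       <description>page images</description>
+#     </dir>
+#     <file>
+#       <name>essen-wind1.jpg</name>
+#       <path>pageimg</path>
+#       <size>17234</size>
+#       <date>2004/06/17 17:58:42</date>
+#     </file>
+#   </resource>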
+
+#######################################################
+# internal variables
+#
+
+# all files in the document directory tree
+my %files;
+# all directories in the document directory tree
+my %dirs;
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+#######################################################
+# subroutines
+#
+
+#
+# fs_read_files($realdir, $docdir, \%files, \%dirs)
+#
+# reads all files and directories below $realdir and puts the
+# files in %files and directories in %dirs
+# $docdir is only for recursion, it should be empty when called 
+# from outside
+#
+sub fs_read_files {
+    my ($directory, $docdir, $files, $dirs) = @_;    
+    my $cnt = 0;
+
+    if (! opendir DIR, $directory) {
+	return 0;
+    }
+    my @dirfiles = readdir DIR;
+    foreach my $fn (@dirfiles) {
+	# ignore names starting with a dot
+	next if ($fn =~ /^\./);
+	# ignore other silly files
+	next if ($junk_files{$fn});
+
+	$cnt++;
+	my $f = "$directory/$fn";
+	my $docf = ($docdir) ? "$docdir/$fn" : $fn;
+	#logger("fs_file: \"$f\"");
+	if (-f $f) {
+	    #logger("  is file");
+	    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
+                      $atime,$mtime,$ctime,$blksize,$blocks)
+                          = stat(_); 
+	    $$files{$docf} = [$fn, $size, $mtime];
+	} elsif (-d _) {
+	    #logger("  is dir");
+	    $$dirs{$docf} = $fn;
+	    # recurse into directory
+	    $cnt += fs_read_files($f, $docf, $files, $dirs);
+	}
+    }
+    return $cnt;
+}
+
+
+#
+# check_dirs($rootnode, \%dirs)
+#
+# reads all dir elements under $rootnode and compares with the directory list
+# in %dirs
+#
+sub check_dirs {
+    my ($root, $fsdirs) = @_;
+
+    #
+    # iterate all dir tags
+    #
+    my @dirnodes = $root->findnodes('child::dir');
+    my %okdirs;
+    foreach my $dirnode (@dirnodes) {
+	my $dirname = sstrip($dirnode->find('child::name'));
+	my $dirpath = sstrip($dirnode->find('child::path'));
+	my $description = sstrip($dirnode->find('child::description'));
+	# name must be valid
+	if (! valid_dir_name($dirname)) {
+	    logger("ERROR", "directory name ($dirpath) $dirname invalid!");
+	    $errcnt++;
+	}
+	# description can be present
+	if (! $description) {
+	    logger("WARNING", "description for directory $dirname (in $dirpath/) missing!");
+	    $warncnt++;
+	}
+	# check with dirs on filesystem 
+	my $fn;
+	if ($dirpath) {
+	    $fn = "$dirpath/$dirname";
+	} else {
+	    $fn = "$dirname";
+	}
+        #logger("dir: \"$dirname\", \"$dirpath\"");
+	if ($$fsdirs{$fn}) {
+	    #logger("  OK ($$fsdirs{$fn})");
+	    $okdirs{$fn} = $dirname;
+	} else {
+	    logger("ERROR", "directory $dirname (in $dirpath/) missing on disk!");
+	    $errcnt++;
+	}
+    }
+    #logger("dirs: ", (scalar keys %$fsdirs), " vs ", (scalar keys %okdirs), "");
+    if ((scalar keys %$fsdirs) != (scalar keys %okdirs)) {
+	# number of dir tags and dirs don't match
+	# iterate through all dirs
+	foreach my $f (sort keys %$fsdirs) {
+	    # was this dir missing?
+	    if (! $okdirs{$f}) {
+		my ($name, $path) = split_file_path($f);
+		# name must be valid
+		if (! valid_dir_name($name)) {
+		    $path = "." unless ($path);
+		    logger("ERROR", "directory name $name (in $path/) invalid!");
+		    $errcnt++;
+		    next;
+		}
+		if ($fix_dirs) {
+		    # add missing dir tag
+		    my $dir_node = $root->addNewChild($namespace, "dir");
+		    $xml_changed++;
+		    # add name
+		    my $name_node = $dir_node->addNewChild($namespace, "name");
+		    $name_node->appendTextNode($name);
+		    # add path
+		    if ($path) {
+			my $path_node = $dir_node->addNewChild($namespace, "path");
+			$path_node->appendTextNode($path);
+		    }
+		    logger("INFO", "directory $f to be added to index file!");
+		} else {
+		    logger("ERROR", "directory $f missing in index file!");
+		    $errcnt++;
+		}
+	    }
+	}
+    }
+}
+
+
+#
+# check_files($rootnode, \%files)
+#
+# reads all file elements under $rootnode and compares with the file list
+# in %files
+#
+sub check_files {
+    my ($root, $fsfiles) = @_;
+
+    #
+    # iterate all file tags
+    #
+    my @filenodes = $root->findnodes('child::file');
+    my %okfiles;
+    foreach my $filenode (@filenodes) {
+	my $filename = sstrip($filenode->find('child::name'));
+	my $filepath = sstrip($filenode->find('child::path'));
+	my $filesize = sstrip($filenode->find('child::size'));
+	my $filedate = sstrip($filenode->find('child::date'));
+	# name must be valid
+	if (! valid_file_name($filename)) {
+	    logger("ERROR", "file name ($filepath)$filename invalid!");
+	    $errcnt++;
+	}
+	my $fn = ($filepath) ? "$filepath/$filename" : "$filename";
+        #logger("file: \"$filename\", \"$filepath\"");
+	if ($$fsfiles{$fn}) {
+	    #logger("  OK ($$fsfiles{$fn})");
+	    $okfiles{$fn} = $filename;
+	    # check file size and date
+	    if ($filesize) {
+		if ($filesize != $$fsfiles{$fn}->[1]) {
+		    logger("WARNING", "size of file $fn changed: $filesize to $$fsfiles{$fn}->[1]");
+		    $warncnt++;
+		}
+	    }
+	    # file date
+	    if ($filedate) {
+		if ($filedate ne stime($$fsfiles{$fn}->[2])) {
+		    logger("WARNING", "date of file $fn changed: $filedate to " . stime($$fsfiles{$fn}->[2]));
+		    $warncnt++;
+		}
+	    }
+	    # update file size and date
+	    if ($fix_fs_meta) {
+		# delete size and date
+		foreach my $n ($filenode->findnodes('child::size')) {
+		    $filenode->removeChild($n);
+		}
+		foreach my $n ($filenode->findnodes('child::date')) {
+		    $filenode->removeChild($n);
+		}
+		# add new size and date
+		my $node = $filenode->addNewChild($namespace, "size");
+		$node->appendTextNode($$fsfiles{$fn}->[1]);
+		$node = $filenode->addNewChild($namespace, "date");
+		$node->appendTextNode(stime($$fsfiles{$fn}->[2]));
+		$xml_changed++;
+	    }
+	} else {
+	    logger("ERROR", "file $filename (in $filepath/) missing on disk!");
+	    $errcnt++;
+	}
+    }
+    #logger("files: ", (scalar keys %$fsfiles), " vs ", (scalar keys %okfiles), "");
+    if ((scalar keys %$fsfiles) != (scalar keys %okfiles)) {
+	# number of file tags and files don't match
+	# iterate through all files
+	foreach my $f (sort keys %$fsfiles) {
+	    my ($name, $path) = split_file_path($f);
+	    # was this file missing?
+	    if (! $okfiles{$f}) {
+		# is an ignoreable file?
+		if ($index_files{$name}) {
+		    next;
+		}
+		# name must be valid
+		if (! valid_file_name($name)) {
+		    $path = "." unless ($path);
+		    logger("ERROR", "file name $name (in $path/) invalid!");
+		    $errcnt++;
+		    next;
+		}
+		if ($fix_files) {
+		    # add missing file tag
+		    my $file_node = $root->addNewChild($namespace, "file");
+		    $xml_changed++;
+		    # add name
+		    my $name_node = $file_node->addNewChild($namespace, "name");
+		    $name_node->appendTextNode($name);
+		    # add path
+		    if ($path) {
+			my $path_node = $file_node->addNewChild($namespace, "path");
+			$path_node->appendTextNode($path);
+		    }
+		    # add size
+		    my $size_node = $file_node->addNewChild($namespace, "size");
+		    $size_node->appendTextNode($$fsfiles{$f}->[1]);
+		    # add date
+		    my $date_node = $file_node->addNewChild($namespace, "date");
+		    $date_node->appendTextNode(stime($$fsfiles{$f}->[2]));
+		    logger("INFO", "file $f to be added to index file!");
+		} else {
+		    logger("ERROR", "file $f missing in index file!");
+		    $errcnt++;
+		}
+	    }
+	}
+    }
+}
+
+#
+# check_resource_meta($rootnode)
+#
+# checks general resource meta information
+#
+sub check_resource_meta {
+    my ($rootnode) = @_;
+
+    #
+    # description
+    #
+    my $description = $rootnode->findvalue('child::description');
+    if (! $description) {
+	logger("ERROR", "resource description element missing!");
+	$errcnt++;
+    }
+    #
+    # name
+    #
+    my $name = sstrip($rootnode->findvalue('child::name'));
+    if ($name) {
+	my ($dirname, $dirpath) = split_file_path($docdir);
+	if ($dirname ne $name) {
+	    logger("ERROR", "resource name element '$name' does not match directory name '$dirname'!");
+	    $errcnt++;
+	}
+    } else {
+	logger("ERROR", "resource name element missing!");
+	$errcnt++;
+    }
+    #
+    # archive path
+    #
+    my $realpath;
+    # get real path
+    if ($docdir =~ /^\//) {
+	# docdir is absolute
+	$realpath = $docdir;
+    } else {
+	# docdir is relative -- try with the shell
+	if (open PWDCMD, "cd $docdir ; pwd|") {
+	    $realpath = <PWDCMD>;
+	    chomp $realpath;
+	}
+    }
+    if (! $realpath) {
+	logger("ERROR", "unable to check real archive path!");
+	$errcnt++;
+	return;
+    }
+    # get archive-path
+    my $archnode = ($rootnode->find('child::archive-path'))->get_node(1);
+    if ($archnode) {
+	my $arch = sstrip($archnode->textContent);
+	if ($arch ne $realpath) {
+	    logger("WARNING", "incorrect archive-path '$arch' will be changed to '$realpath'!");
+	    $warncnt++;
+	    # correct archive-path
+	    $archnode->removeChildNodes;
+	    $archnode->appendTextNode($realpath);
+	    $xml_changed++;
+	}
+    } else {
+	# add archive-path
+	$archnode = $rootnode->addNewChild($namespace, "archive-path");
+	$archnode->appendTextNode($realpath);
+	$xml_changed++;
+    }
+
+}
+
+    
+
+#######################################################
+# main
+#
+
+logger("INFO", "metacheck $version");
+    
+my ($document, $rootnode) = read_xml($metafile);
+
+check_resource_meta($rootnode);
+
+my $fnum = fs_read_files($docdir, "", \%files, \%dirs);
+logger("INFO", "$fnum files on FS");
+#foreach (keys %files) {logger("  file ($_): $files{$_}");}
+
+check_files($rootnode, \%files);
+check_dirs($rootnode, \%dirs);
+
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt > 0) {
+    logger("ABORT", "there were errors!");
+    exit 1;
+} else {
+    if ($fix_xml) {
+	write_xml($document, $metafile);
+    }
+    logger("DONE", "index file checked successfully!");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/unarchiver.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,327 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+
+use XML::LibXML;
+
+# MPIWG libraries
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.1 (24.9.2003)";
+
+# read command line parameters
+my $args = parseargs;
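+# example invocation (illustrative):
+#   unarchiver.pl -debug=1 /mpiwg/archive/data/some_resource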
+
+# debug level
+$debug = (exists $$args{'debug'}) ? ($$args{'debug'}) : 0;
+
+# rewrite XML file (necessary for archive date!)
+my $fix_xml = 1;
+my $xml_changed = 0;
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+# archive name (archive-path element, usually == $docdir)
+my $archname;
+# archive storage date
+my $archdate;
+
+#######################################################
+# external programs
+#
+my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc";
+if (! -x $archprog) {
+    logger("ABORT", "TSM client program '$archprog' missing!!");
+    exit 1;
+}
+my $checkprog = "/usr/local/mpiwg/archive/archivecheck";
+if (! -x $checkprog) {
+    logger("ABORT", "archive checking program '$checkprog' missing!!");
+    exit 1;
+}
+# log file for archiver
+my $log_file = "/var/tmp/unarchiver.log";
+if (! open LOG, ">>$log_file") {
+    logger("ABORT", "unable to write log file '$log_file'!!");
+    exit 1;
+}
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $docdir = $$args{'path'};
+if (! $docdir) {
+    print "ABORT: no document directory given!\n";
+    exit 1;
+}
+# strip trailing slashes
+$docdir =~ s/\/$//;
+if (! -d $docdir) {
+    print "ABORT: document directory \'$docdir\' doesn't exist!\n";
+    exit 1;
+}
+
+my $metafile = "$docdir/index.meta";
+if (! -f $metafile) {
+    print "ABORT: metadata index file \'$metafile\' doesn't exist!\n";
+    exit 1;
+}
+
+
+#######################################################
+# internal variables
+#
+
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+#######################################################
+# subroutines
+#
+
+
+#
+# $files = read_resource_meta($rootnode)
+#
+# checks general resource meta information and reads the list of files
+#
+sub read_resource_meta {
+    my ($rootnode) = @_;
+    my %files;
+    #
+    # archive path
+    #
+    # get archive-path
+    $archname = sstrip($rootnode->findvalue('child::archive-path'));
+    if (! $archname) {
+	logger("ABORT", "archive-path element missing!!");
+	exit 1;
+    }
+
+    #
+    # files
+    #
+    my @filenodes = $rootnode->findnodes('child::file');
+    foreach my $fn (@filenodes) {
+	my $name = sstrip($fn->findvalue('child::name'));
+	my $path = sstrip($fn->findvalue('child::path'));
+	logger("DEBUG", "FILE: ($path)$name");
+	my $f = ($path) ? "$path/$name" : "$name";
+	$files{$f} = [$name];
+    }
+
+    #
+    # dirs
+    #
+    my @dirnodes = $rootnode->findnodes('child::dir');
+    foreach my $fn (@dirnodes) {
+	my $name = sstrip($fn->findvalue('child::name'));
+	my $path = sstrip($fn->findvalue('child::path'));
+	logger("DEBUG", "DIR: ($path)$name");
+	my $f = ($path) ? "$path/$name" : "$name";
+	$files{$f} = [$name];
+    }
+
+    #
+    # archive-storage-date
+    #
+    my $archdate = $rootnode->find('child::archive-storage-date');
+    if ($archdate) {
+	logger("INFO", "archive storage date: $archdate");
+    } else {
+	logger("ERROR", "archive storage date missing!");
+	$errcnt++;
+    }
+
+    #
+    # archive-recall-date
+    #
+    my $recalldatenode = ($rootnode->find('child::archive-recall-date'))->get_node(1);
+    if ($recalldatenode) {
+	print "INFO: archive recall date exists!\n";
+	# delete old date
+	$recalldatenode->removeChildNodes;
+    } else {
+	# create new storage date node
+	$recalldatenode = $rootnode->addNewChild($namespace, "archive-recall-date");
+	# move after archive-path
+	$rootnode->insertAfter($recalldatenode, ($rootnode->find('child::archive-storage-date'))->get_node(1));
+    }
+    $recalldatenode->appendTextNode(scalar localtime);
+    $xml_changed++;
+
+    return \%files;
+}
+
+
+#
+# $%files = run_retrieve
+#
+# runs the retriever program on $docdir and returns a list of archived files
+#
+# Sample output:
+# Retrieving          17,234 /mpiwg/archive/data/test/auto_titit_123/pageimg/essen-wind1.jpg [Done]
+#
+sub run_retrieve {
+    my %files;
+    print LOG "START unarchive $version ", scalar localtime, "\n";
+    my $archcmd = $archprog;
+    $archcmd .= " retrieve -subdir=yes -replace=all";
+    $archcmd .= " -description='$archname'";
+    $archcmd .= " '$docdir/'";
+
+    my $archcnt = 0;
+    print LOG "CMD: $archcmd\n";
+    if (open ARCH, "$archcmd 2>&1 |") {
+	while (<ARCH>) {
+	    chomp;
+	    print LOG "ARCH: $_\n";
+	    if (/
+		Retrieving
+		\s+([\d,]+)    # size
+		\s+(\S+)       # file name
+		\s+\[Done\]
+		/x) {
+		my $size = $1;
+		my $file = $2;
+		$size =~ s/,//g;
+		logger("DEBUG", "  RETRIEVE: file '$file'");
+		$archcnt++;
+		if ($files{$file}) {
+		    logger("WARNING", "file $file seems to have been retrieved multiple times.");
+		    $warncnt++;
+		} 
+		$files{$file} = [$size];
+	    }
+	}
+	logger("INFO", "$archcnt archives of " . (scalar keys %files) . " files.");
+    } else {
+	logger("ABORT", "unable to start archive command '$archcmd'!!");
+	exit 1;
+    }
+    return \%files;
+}
+
+
+#
+# check_files(\%files_to_retrieve, \%retrieved_files)
+#
+# compares the list of archived and retrieved files
+#
+sub check_files {
+    my ($to_retrieve, $retrieved) = @_;
+
+    my $nt = scalar keys %$to_retrieve;
+    my $na = scalar keys %$retrieved;
+
+    foreach my $ft (sort keys %$to_retrieve) {
+	my $fp = "$docdir/$ft";
+	#logger("DEBUG", "  fp: $fp");
+	if ($$retrieved{$fp}) {
+	    logger("DEBUG", "$ft retrieved OK");
+	    $$retrieved{$fp}->[1] = "OK";
+	} else {
+	    logger("ERROR", "file entry '$ft' missing from archive!");
+	    $errcnt++;
+	}
+    }
+
+    foreach my $fa (sort keys %$retrieved) {
+	if (! $$retrieved{$fa}->[1]) {
+	    my ($fn, $fp) = split_file_path($fa);
+	    if ($index_files{$fn}) {
+		logger("DEBUG", "$fa ignored");
+		$na--;
+	    } else {
+		logger("WARNING", "$fa retrieved but not in list!");
+		$warncnt++;
+	    }
+	}
+    }
+
+    if ($nt > $na) {
+	logger("WARNING", "fewer files were retrieved ($na vs. $nt)!");
+	$warncnt++;
+    } elsif ($na > $nt) {
+	logger("WARNING", "more files were retrieved ($na vs. $nt)!");
+	$warncnt++;
+    }
+
+}
+
+
+
+#######################################################
+# main
+#
+
+logger("INFO", "unarchiver $version");
+
+# make sure the right user is running this program
+my $user = getlogin;
+if (($user ne "archive")&&($user ne "root")) {
+    logger("ABORT", "you must be archive or root user to run this program!");
+    exit 1;
+}
+
+# use checkarchive first
+if (system("$checkprog $docdir >/dev/null") == 0) {
+    logger("INFO", "archive '$docdir' check OK");
+} else {
+    logger("ABORT", "archive '$docdir' check failed!!");
+    exit 1;
+}
+
+# read index.meta file
+my ($document, $rootnode) = read_xml($metafile);
+
+# check index file
+my $archived_files = read_resource_meta($rootnode);
+my $num_archived_files = scalar keys %$archived_files;
+
+# check for .archived file
+if (-f "$docdir/.archived") {
+    logger("INFO", ".archived file exists.");
+} else {
+    logger("WARNING", "no .archived file!");
+    $warncnt++;
+}
+
+logger("INFO", "$num_archived_files files to retrieve.");
+
+# retrieve
+my $retrieved_files = run_retrieve;
+
+my $num_arch_files = (scalar keys %$retrieved_files);
+if ($num_arch_files == 0) {
+    logger("ABORT", "no files retrieved!!");
+    exit 1;
+}
+logger("INFO", "$num_arch_files files retrieved");
+
+# check list of archived files
+check_files($archived_files, $retrieved_files);
+
+# rewrite index.meta file
+write_xml($document, $metafile);
+
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt == 0) {
+    logger("DONE", "" . (scalar keys %$retrieved_files) . " archived files retrieved");
+} else {
+    logger("ABORT", "there were $errcnt errors!!");
+    exit 1;
+}