File:  [Repository] / foxridge-archiver / unarchiver.pl
Revision 1.9: download - view: text, annotated - select for diffs - revision graph
Thu Mar 16 17:00:43 2017 UTC (7 years, 1 month ago) by casties
Branches: MAIN
CVS tags: HEAD
updated to Ubuntu Perl paths.

#!/usr/bin/perl -w

use strict;

use XML::LibXML;
use FileHandle;

# MPIWG libraries
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;


#######################################################
# internal parameters
#

# program version
my $version = "0.3.4 (11.8.2009)";

# read command line parameters
my $args = parseargs;

# debug level
$debug = (exists $$args{'debug'}) ? ($$args{'debug'}) : 0;

# rewrite XML file (necessary for archive date!)
my $fix_xml = 1;
my $xml_changed = 0;
# XML namespace (not really implemented!)
my $namespace = "";

# archive name (archive-path element, usually == $docdir)
my $archname;
# archive storage date
my $archdate;

#######################################################
# external programs
#
my $archprog = "/opt/tivoli/tsm/client/ba/bin/dsmc";
if (! -x $archprog) {
    logger("ABORT", "TSM client program '$archprog' missing!!");
    exit 1;
}
my $checkprog = "/usr/local/mpiwg/archive/archivecheck";
if (! -x $checkprog) {
    logger("ABORT", "archive checking program '$checkprog' missing!!");
    exit 1;
}
# log file for archiver
my $log_file = "/var/tmp/unarchiver.log";
if (! open LOG, ">>$log_file") {
    logger("ABORT", "unable to write log file '$log_file'!!");
    exit 1;
}
LOG->autoflush(1);

#######################################################
# check parameters that were passed to the program
#
my $docdir = $$args{'path'};
if (! $docdir) {
    print "ABORT: no document directory given!\n";
    exit 1;
}
# strip trailing slashes
$docdir =~ s/\/$//;
if (! -d $docdir) {
    print "ABORT: document directory \'$docdir\' doesn't exist!\n";
    exit 1;
}

my $metafile = "$docdir/index.meta";
if (! -f $metafile) {
    print "ABORT: metadata index file \'$metafile\' doesn't exist!\n";
    exit 1;
}

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

#######################################################
# subroutines
#

# construct document's parent dir
sub get_parent {
    my ($dirname) = @_;
    my $dirparent = $dirname;
    $dirparent =~ s!/[^/]+$!!;
    return $dirparent;
}


#
# $files = read_resource_meta($rootnode)
#
# checks general resource meta information and reads the list of files
#
sub read_resource_meta {
    my ($rootnode) = @_;
    my %files;
    #
    # archive path
    #
    # get archive-path
    $archname = sstrip($rootnode->findvalue('child::archive-path'));
    if (! $archname) {
	logger("ABORT", "archive-name element missing!!");
	exit 1;
    }

    #
    # files
    #
    my @filenodes = $rootnode->findnodes('child::file');
    foreach my $fn (@filenodes) {
	my $name = sstrip($fn->findvalue('child::name'));
	my $path = sstrip($fn->findvalue('child::path'));
	logger("DEBUG", "FILE: ($path)$name");
	my $f = ($path) ? "$path/$name" : "$name";
	$files{$f} = [$name];
    }

    #
    # dirs
    #
    my @dirnodes = $rootnode->findnodes('child::dir');
    foreach my $fn (@dirnodes) {
	my $name = sstrip($fn->findvalue('child::name'));
	my $path = sstrip($fn->findvalue('child::path'));
	logger("DEBUG", "DIR: ($path)$name");
	my $f = ($path) ? "$path/$name" : "$name";
	$files{$f} = [$name];
    }

    #
    # archive-storage-date
    #
    my $archdate = $rootnode->find('child::archive-storage-date');
    if ($archdate) {
	logger("INFO", "archive storage date: $archdate");
    } else {
	logger("ERROR", "archive storage date missing!");
	$errcnt++;
    }

    #
    # archive-recall-date
    #
    my $recalldatenode = ($rootnode->find('child::archive-recall-date'))->get_node(1);
    if ($recalldatenode) {
	print "INFO: archive recall date exists!\n";
	# delete old date
	$recalldatenode->removeChildNodes;
    } else {
	# create new storage date node
	$recalldatenode = $rootnode->addNewChild($namespace, "archive-recall-date");
	# move after archive-path
	$rootnode->insertAfter($recalldatenode, ($rootnode->find('child::archive-storage-date'))->get_node(1));
    }
    $recalldatenode->appendTextNode(scalar localtime);
    $xml_changed++;

    return \%files;
}


#
# $num_files = run_retrieve($docdir, $docmount, \%files)
#
# Runs the retriever program on $docdir and returns the number of unarchived files.
# All filenames are put in %files. 
# $docmount is the mount point of the doc partition in cases when the new mount point
# is different.
#
# Sample output:
# (old!) Retrieving          17,234 /mpiwg/archive/data/test/auto_titit_123/pageimg/essen-wind1.jpg [Done]
# Retrieving      42,406,326 /mpiwg/archive/data/library/B980G582/raw/00015.tif --> /mpiwg/archive/data/library/B980G582/raw/00015.tif [Done]
sub run_retrieve {
    my ($archdir, $archmount, $files) = @_;
    my $archparent;
    if ($archmount eq $archdir) {
	# no explicit mount point
	$archparent = get_parent($archdir);
    } else {
	# destination dir is mount point
	$archparent = $archmount;
    }
    logger("INFO", "looking for archives in $archmount...");

    print LOG "START unarchive $version on ", scalar localtime, "\n";
    my $archcmd = $archprog;
    $archcmd .= " retrieve -subdir=yes -replace=all -ifnewer";
    $archcmd .= " -description='$archname'"; # archive name
    $archcmd .= " '$archmount/'"; # archive mount point
    $archcmd .= " '$archparent/'"; # destination dir name

    logger('INFO', "querying TSM server for $archmount, please wait...");

    my $archcnt = 0;
    my $numfiles = 0;
    print LOG "CMD: $archcmd\n";
    if (open ARCH, "$archcmd 2>&1 |") {
	while (<ARCH>) {
	    chomp;
	    print LOG "ARCH: $_\n";
	    if (/
		Retrieving
		\s+([\d,]+)    # size
		\s+(\S+)       # file name
		\s+-->
		\s+(\S+)       # destination file name
		\s+\[Done\]
		/x) {
		my $size = $1;
		my $file = $2;
		$size =~ s/,//g;
		logger("DEBUG", "  RETRIEVE: file '$file'");
		$archcnt++;
		if ($$files{$file}) {
		    logger("WARNING", "file $file seems to be archived multiple times.");
		    $warncnt++;
		} 
		$$files{$file} = [$size];
	    }
	}
	$numfiles =  (scalar keys %$files);
	logger("INFO", "$archcnt archives of $numfiles files (in $archmount).");
    } else {
	logger("ABORT", "unable to start archive command '$archcmd'!!");
	exit 1;
    }
    return $numfiles;
}


#
# check_files(\%files_to_retrieve, \%retrieved_files)
#
# compares the list of archived and retrieved files
#
sub check_files {
    my ($to_retrieve, $retrieved) = @_;

    my $nt = scalar keys %$to_retrieve;
    my $na = scalar keys %$retrieved;

    foreach my $ft (sort keys %$to_retrieve) {
	my $fp = "$docdir/$ft";
	#logger("DEBUG", "  fp: $fp");
	if ($$retrieved{$fp}) {
	    logger("DEBUG", "$ft retrieved OK");
	    $$retrieved{$fp}->[1] = "OK";
	} else {
	    logger("ERROR", "file entry '$ft' missing from archive!");
	    $errcnt++;
	}
    }

    foreach my $fa (sort keys %$retrieved) {
	if (! $$retrieved{$fa}->[1]) {
	    my ($fn, $fp) = split_file_path($fa);
	    if ($index_files{$fn}) {
		logger("DEBUG", "$fa ignored");
		$na--;
	    } else {
		logger("WARNING", "$fa retrieved but not in list!");
		$warncnt++;
	    }
	}
    }

    if ($nt > $na) {
	logger("WARNING", "less files were retrieved ($na vs. $nt)!");
	$warncnt++;
    } elsif ($na > $nt) {
	logger("WARNING", "more files were retrieved ($na vs. $nt)!");
	$warncnt++;
    }

}



#######################################################
# main
#

logger("INFO", "unarchiver $version");

# make shure the right user is running this program
my $user = getlogin || getpwuid($<);
if (($user)&&($user ne "archive")&&($user ne "root")) {
    logger("ABORT", "you must be archive or root user to run this program!");
    exit 1;
}

# use checkarchive first
if (system("$checkprog $docdir >/dev/null") == 0) {
    logger("INFO", "archive \"$docdir\" check OK");
} else {
    logger("ABORT", "archive \"$docdir\" check failed!!");
    exit 1;
}

# read index.meta file
my ($document, $rootnode) = read_xml($metafile);

# check index file
my $archived_files = read_resource_meta($rootnode);
my $num_archived_files = scalar keys %$archived_files;

# check for .archived file
if (-f "$docdir/.archived") {
    logger("INFO", ".archived file exists.");
} else {
    logger("WARNING", "no .archived file!");
    $warncnt++;
}

logger("INFO", "$num_archived_files files to retrieve.");

# save current index.meta
park_file($metafile); 

# retrieve
my %retrieved_files = ();
my $archcnt = 0;

if ($docdir =~ /\/mpiwg\/archive\/data\/(.*)/) {
    # TSM needs two different paths because of historical mount points :-(
    # try the new one first
    $archcnt = run_retrieve($docdir, "/mpiwg/archive", \%retrieved_files);
    if ($archcnt == 0) {
	# and then the old one
	$archcnt = run_retrieve($docdir, "/mpiwg/archive/data", \%retrieved_files);
    }
} else {
    # otherwise we assume that it works
    $archcnt += run_retrieve($docdir, $docdir, \%retrieved_files);
}

# restore current index.meta
unpark_file($metafile);

if ($archcnt == 0) {
    logger("ABORT", "no files retrieved!!");
    exit 1;
}
logger("INFO", "$archcnt files retrieved");

# check list of archived files
check_files($archived_files, \%retrieved_files);

# rewrite index.meta file
write_xml($document, $metafile);

logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt == 0) {
    logger("DONE", "$archcnt archived files retrieved");
} else {
    logger("ABORT", "there were $errcnt errors!!");
    exit 1;
}

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>