view metacheck.pl @ 18:fdf4ceb36db1

fixed problem with dir names in metacheck new version of metacheck defaults to not change index file new version of archiver uses new version of metacheck
author casties
date Tue, 20 Sep 2005 19:24:57 +0200
parents b7259a1c85aa
children 1dd183b95c61
line wrap: on
line source

#!/usr/local/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.6.0 (20.9.2005)";
my $help = 
"use: metacheck [options] docdir
options:
  -debug  show debugging info
  -dry-run  simulate, dont'do anything
  -checkonly  leave existing index file untouched
  -add-files  add file tags for missing files
  -replace  rewrite index file to match current files
";
logger("INFO", "metacheck $version");

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# check only or fix index file also
my $check_only = (exists $$args{'checkonly'}) ? $$args{'checkonly'} : 1;

# add file tags for missing files
my $fix_files = ! $check_only;
# add tags for file size and date
my $fix_fs_meta = 1;
# add dir tags for missing directories
my $fix_dirs = ! $check_only;
# rewrite XML file (necessary for fix_files and fix_dirs)
my $fix_xml = ! $check_only;
# rewrite complete index file
my $do_rewrite = 0;

# add file tags for missing files
if (exists $$args{'add-files'}) {
    $check_only = 0;
    $fix_files = 1;
    $fix_dirs = 1;
    $fix_xml = 1;
    $do_rewrite = 0;
    logger('DEBUG', "add-files: true");
}

# completely rewrite index file
if (exists $$args{'replace'}) {
    $check_only = 0;
    $fix_files = 1;
    $fix_dirs = 1;
    $fix_xml = 1;
    $do_rewrite = 1;
    logger('DEBUG', "replace: true");
}
logger('DEBUG', "checkonly: $check_only");


my $xml_changed = 0;
# XML namespace (not really implemented!)
my $namespace = "";


#######################################################
# check parameters that were passed to the program
#
my $docdir = $$args{'path'};
if (! $docdir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip double slashes
$docdir =~ s/\/\//\//;
# strip trailing slashes
$docdir =~ s/\/$//;
if (! -d $docdir) {
    logger("ABORT", "document directory \'$docdir\' doesn't exist!");
    exit 1;
}

my $metafile = "$docdir/index.meta";
if (! -f $metafile) {
    logger("ABORT", "metadata index file \'$metafile\' doesn't exist!");
    exit 1;
}

#######################################################
# internal variables
#

# all files in the document directory tree
my %files;
# all directories in the document directory tree
my %dirs;
# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

#######################################################
# subroutines
#

#
# fs_read_files($realdir, $docdir, \%files, \%dirs)
#
# reads all files and directories below $realdir and puts the
# files in %files and directories in %dirs
# $docdir is only for recursion, it should be empty when called 
# from outside
#
sub fs_read_files {
    my ($directory, $docdir, $files, $dirs) = @_;    
    my $cnt = 0;

    if (! opendir DIR, $directory) {
	return 0;
    }
    my @dirfiles = readdir DIR;
    foreach my $fn (@dirfiles) {
	# ignore names starting with a dot
	next if ($fn =~ /^\./);
	# ignore other silly files
	next if ($junk_files{$fn});

	$cnt++;
	my $f = "$directory/$fn";
	my $docf = ($docdir) ? "$docdir/$fn" : $fn;
	#logger("fs_file: \"$f\"");
	if (-f $f) {
	    #logger("  is file");
	    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
                      $atime,$mtime,$ctime,$blksize,$blocks)
                          = stat(_); 
	    $$files{$docf} = [$fn, $size, $mtime];
	} elsif (-d _) {
	    #logger("  is dir");
	    $$dirs{$docf} = $fn;
	    # recurse into directory
	    $cnt += fs_read_files($f, $docf, $files, $dirs);
	}
    }
    return $cnt;
}


#
# check_dirs($rootnode, \%dirs)
#
# reads all dir elements under $rootnode and compares with the directory list
# in %dirs
#
sub check_dirs {
    my ($root, $fsdirs) = @_;

    #
    # iterate all dir tags
    #
    my @dirnodes = $root->findnodes('child::dir');
    my %okdirs;
    foreach my $dirnode (@dirnodes) {
	my $dirname = sstrip($dirnode->find('child::name'));
	my $dirpath = sstrip($dirnode->find('child::path'));
	my $description = sstrip($dirnode->find('child::description'));
	# name must be valid
	if (! valid_dir_name($dirname)) {
	    logger("ERROR", "directory name ($dirpath) $dirname invalid!");
	    $errcnt++;
	}
	# description can be present
	if (! $description) {
	    logger("WARNING", "description for directory $dirname (in $dirpath/) missing!");
	    $warncnt++;
	}
	# check with dirs on filesystem 
	my $fn;
	if ($dirpath && ($dirpath ne '.')) {
	    $fn = "$dirpath/$dirname";
	} else {
	    $fn = "$dirname";
	}
        #logger('DEBUG', "dir: \"$dirname\", \"$dirpath\", fn: \"$fn\"");
	if ($$fsdirs{$fn}) {
	    #logger("  OK ($$fsdirs{$fn})");
	    $okdirs{$fn} = $dirname;
	} else {
	    if ($do_rewrite) {
		# remove dir tag
		logger("WARNING", "directory $dirname (in $dirpath/) no longer on disk!");
		$dirnode->unbindNode();
		$warncnt++;
	    } else {
		logger("ERROR", "directory $dirname (in $dirpath/) missing on disk!");
		$errcnt++;
	    }
	}
    }
    #logger("dirs: ", (scalar keys %$fsdirs), " vs ", (scalar keys %okdirs), "");
    if ((scalar keys %$fsdirs) != (scalar keys %okdirs)) {
	# number of dir tags and dirs don't match
	# iterate through all dirs
	foreach my $f (sort keys %$fsdirs) {
	    # was this dir missing?
	    if (! $okdirs{$f}) {
		my ($name, $path) = split_file_path($f, 1);
		# name must be valid
		if (! valid_dir_name($name)) {
		    $path = "." unless ($path);
		    logger("ERROR", "directory name $name (in $path/) invalid!");
		    $errcnt++;
		    next;
		}
		if ($fix_dirs) {
		    # add missing dir tag
		    my $dir_node = $root->addNewChild($namespace, "dir");
		    $xml_changed++;
		    # add name
		    my $name_node = $dir_node->addNewChild($namespace, "name");
		    $name_node->appendTextNode($name);
		    # add path
		    if ($path) {
			my $path_node = $dir_node->addNewChild($namespace, "path");
			$path_node->appendTextNode($path);
		    }
		    logger("INFO", "directory $f to be added to index file!");
		} else {
		    logger("ERROR", "directory $f missing in index file!");
		    $errcnt++;
		}
	    }
	}
    }
}


#
# check_files($rootnode, \%files)
#
# reads all file elements under $rootnode and compares with the file list
# in %files
#
sub check_files {
    my ($root, $fsfiles) = @_;

    #
    # iterate all file tags
    #
    my @filenodes = $root->findnodes('child::file');
    my %okfiles;
    foreach my $filenode (@filenodes) {
	my $filename = sstrip($filenode->find('child::name'));
	my $filepath = sstrip($filenode->find('child::path'));
	my $filesize = sstrip($filenode->find('child::size'));
	my $filedate = sstrip($filenode->find('child::date'));
	# name must be valid
	if (! valid_file_name($filename)) {
	    logger("ERROR", "file name ($filepath)$filename invalid!");
	    $errcnt++;
	}
	my $fn = ($filepath) ? "$filepath/$filename" : "$filename";
        #logger("file: \"$filename\", \"$filepath\"");
	if ($$fsfiles{$fn}) {
	    #logger("  OK ($$fsfiles{$fn})");
	    $okfiles{$fn} = $filename;
	    # check file size and date
	    if ($filesize) {
		if ($filesize != $$fsfiles{$fn}->[1]) {
		    logger("WARNING", "size of file $fn changed: $filesize to $$fsfiles{$fn}->[1]");
		    $warncnt++;
		}
	    }
	    # file date
	    if ($filedate) {
		if ($filedate ne stime($$fsfiles{$fn}->[2])) {
		    logger("WARNING", "date of file $fn changed: $filedate to ", stime($$fsfiles{$fn}->[2]), "");
		    $warncnt++;
		}
	    }
	    # update file size and date
	    if ($fix_fs_meta) {
		# delete size and date
		foreach my $n ($filenode->findnodes('child::size')) {
		    $filenode->removeChild($n);
		}
		foreach my $n ($filenode->findnodes('child::date')) {
		    $filenode->removeChild($n);
		}
		# add new size and date
		my $node = $filenode->addNewChild($namespace, "size");
		$node->appendTextNode($$fsfiles{$fn}->[1]);
		$node = $filenode->addNewChild($namespace, "date");
		$node->appendTextNode(stime($$fsfiles{$fn}->[2]));
		$xml_changed++;
	    }
	} else {
	    if ($do_rewrite) {
		# remove file tag
		logger("WARNING", "file $filename (in $filepath/) no longer on disk!");
		$filenode->unbindNode();
		$warncnt++;
	    } else {
		logger("ERROR", "file $filename (in $filepath/) missing on disk!");
		$errcnt++;
	    }
	}
    }
    #logger("files: ", (scalar keys %$fsfiles), " vs ", (scalar keys %okfiles), "");
    if ((scalar keys %$fsfiles) != (scalar keys %okfiles)) {
	# number of file tags and files don't match
	# iterate through all files
	foreach my $f (sort keys %$fsfiles) {
	    my ($name, $path) = split_file_path($f, 1);
	    # was this file missing?
	    if (! $okfiles{$f}) {
		# is an ignoreable file?
		if ($index_files{$name}) {
		    next;
		}
		# name must be valid
		if (! valid_dir_name($name)) {
		    $path = "." unless ($path);
		    logger("ERROR", "file name $name (in $path/) invalid!");
		    $errcnt++;
		    next;
		}
		if ($fix_files) {
		    # add missing file tag
		    my $file_node = $root->addNewChild($namespace, "file");
		    $xml_changed++;
		    # add name
		    my $name_node = $file_node->addNewChild($namespace, "name");
		    $name_node->appendTextNode($name);
		    # add path
		    if ($path) {
			my $path_node = $file_node->addNewChild($namespace, "path");
			$path_node->appendTextNode($path);
		    }
		    # add size
		    my $size_node = $file_node->addNewChild($namespace, "size");
		    $size_node->appendTextNode($$fsfiles{$f}->[1]);
		    # add date
		    my $date_node = $file_node->addNewChild($namespace, "date");
		    $date_node->appendTextNode(stime($$fsfiles{$f}->[2]));
		    logger("INFO", "file $f to be added to index file!");
		} else {
		    logger("ERROR", "file $f missing in index file!");
		    $errcnt++;
		}
	    }
	}
    }
}

#
# check_resource_meta($rootnode)
#
# checks general resource meta information
#
sub check_resource_meta {
    my ($rootnode) = @_;

    #
    # description
    #
    my $description = $rootnode->findvalue('child::description');
    if (! $description) {
	logger("ERROR", "resource description element missing!");
	$errcnt++;
    }
    #
    # name
    #
    my $name = sstrip($rootnode->findvalue('child::name'));
    if ($name) {
	my ($dirname, $dirpath) = split_file_path($docdir);
	if ($dirname ne $name) {
	    logger("ERROR", "resource name element '$name' does not match directory name '$dirname'!");
	    $errcnt++;
	}
    } else {
	logger("ERROR", "resource name element missing!");
	$errcnt++;
    }
    #
    # archive path
    #
    my $realpath;
    # get real path
    if ($docdir =~ /^\//) {
	# docdir is absolute
	$realpath = $docdir;
    } else {
	# docdir is relative -- try with the shell
	if (open PWDCMD, "cd $docdir ; pwd|") {
	    $realpath = <PWDCMD>;
	    chomp $realpath;
	}
    }
    if (! $realpath) {
	logger("ERROR", "unable to check real archive path!");
	$errcnt++;
	return;
    }
    # get archive-path
    my $archnode = ($rootnode->find('child::archive-path'))->get_node(1);
    if ($archnode) {
	my $arch = sstrip($archnode->textContent);
	if ($arch ne $realpath) {
	    logger("WARNING", "incorrect archive-path '$arch' will be changed to '$realpath'!");
	    $warncnt++;
	    # correct archive-path
	    $archnode->removeChildNodes;
	    $archnode->appendTextNode($realpath);
	    $xml_changed++;
	}
    } else {
	# add archive-path
	$archnode = $rootnode->addNewChild($namespace, "archive-path");
	$archnode->appendTextNode($realpath);
	$xml_changed++;
    }

}

    

#######################################################
# main
#

my ($document, $rootnode) = read_xml($metafile);

check_resource_meta($rootnode);

my $fnum = fs_read_files($docdir, "", \%files, \%dirs);
logger("INFO", "$fnum files on FS");
#foreach (keys %dirs) {logger('DEBUG', "  dir ($_): $dirs{$_}");}

check_files($rootnode, \%files);
check_dirs($rootnode, \%dirs);

logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were $errcnt errors!");
    exit 1;
} else {
    if ($fix_xml) {
	if ($dry_run) {
	    logger('INFO', "would write $metafile");
	    logger('DEBUG', $document->toString(1));
	} else {
	    write_xml($document, $metafile);
	}
    }
    logger("DONE", "index file checked successfully!");
}