view metacheck.pl @ 35:e27d097a9d87

still issues with "while read"
author casties
date Fri, 23 Jun 2006 11:40:12 +0200
parents 1dd183b95c61
children 2208ed7370cb
line wrap: on
line source

#!/usr/local/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.6.0 (20.9.2005)";
my $help = 
"use: metacheck [options] docdir
options:
  -debug  show debugging info
  -dry-run  simulate, dont'do anything
  -checkonly  leave existing index file untouched
  -add-files  add file tags for missing files
  -replace  rewrite index file to match current files
";
logger("INFO", "metacheck $version");

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# check only or fix index file also
my $check_only = (exists $$args{'checkonly'}) ? $$args{'checkonly'} : 1;

# add file tags for missing files
my $fix_files = ! $check_only;
# add tags for file size and date
my $fix_fs_meta = 1;
# add dir tags for missing directories
my $fix_dirs = ! $check_only;
# rewrite XML file (necessary for fix_files and fix_dirs)
my $fix_xml = ! $check_only;
# rewrite complete index file
my $do_rewrite = 0;

# add file tags for missing files
if (exists $$args{'add-files'}) {
    $check_only = 0;
    $fix_files = 1;
    $fix_dirs = 1;
    $fix_xml = 1;
    $do_rewrite = 0;
    logger('DEBUG', "add-files: true");
}

# completely rewrite index file
if (exists $$args{'replace'}) {
    $check_only = 0;
    $fix_files = 1;
    $fix_dirs = 1;
    $fix_xml = 1;
    $do_rewrite = 1;
    logger('DEBUG', "replace: true");
}
logger('DEBUG', "checkonly: $check_only");


my $xml_changed = 0;
# XML namespace (not really implemented!)
my $namespace = "";


#######################################################
# check parameters that were passed to the program
#
my $docdir = $$args{'path'};
if (! $docdir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip double slashes
$docdir =~ s/\/\//\//;
# strip trailing slashes
$docdir =~ s/\/$//;
if (! -d $docdir) {
    logger("ABORT", "document directory \'$docdir\' doesn't exist!");
    exit 1;
}

my $metafile = "$docdir/index.meta";
if (! -f $metafile) {
    logger("ABORT", "metadata index file \'$metafile\' doesn't exist!");
    exit 1;
}

#######################################################
# internal variables
#

# all files in the document directory tree
my %files;
# all directories in the document directory tree
my %dirs;
# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

#######################################################
# subroutines
#

#
# fs_read_files($realdir, $docdir, \%files, \%dirs)
#
# reads all files and directories below $realdir and puts the
# files in %files and directories in %dirs
# $docdir is only for recursion, it should be empty when called 
# from outside
#
sub fs_read_files {
    my ($directory, $docdir, $files, $dirs) = @_;    
    my $cnt = 0;

    if (! opendir DIR, $directory) {
	return 0;
    }
    my @dirfiles = readdir DIR;
    foreach my $fn (@dirfiles) {
	# ignore names starting with a dot
	next if ($fn =~ /^\./);
	# ignore other silly files
	next if ($junk_files{$fn});

	$cnt++;
	my $f = "$directory/$fn";
	my $docf = ($docdir) ? "$docdir/$fn" : $fn;
	#logger("fs_file: \"$f\"");
	if (-f $f) {
	    #logger("  is file");
	    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
                      $atime,$mtime,$ctime,$blksize,$blocks)
                          = stat(_); 
	    $$files{$docf} = [$fn, $size, $mtime];
	} elsif (-d _) {
	    #logger("  is dir");
	    $$dirs{$docf} = $fn;
	    # recurse into directory
	    $cnt += fs_read_files($f, $docf, $files, $dirs);
	}
    }
    return $cnt;
}


#
# check_dirs($rootnode, \%dirs)
#
# reads all dir elements under $rootnode and compares with the directory list
# in %dirs
#
sub check_dirs {
    my ($root, $fsdirs) = @_;

    #
    # iterate all dir tags
    #
    my @dirnodes = $root->findnodes('child::dir');
    my %okdirs;
    foreach my $dirnode (@dirnodes) {
	my $dirname = sstrip($dirnode->find('child::name'));
	my $dirpath = sstrip($dirnode->find('child::path'));
	my $description = sstrip($dirnode->find('child::description'));
	# name must be valid
	if (! valid_dir_name($dirname)) {
	    if ($do_rewrite) {
		logger("WARNING", "directory name ($dirpath) $dirname in index file invalid!");
		$warncnt++;
	    } else {
		logger("ERROR", "directory name ($dirpath) $dirname invalid!");
		$errcnt++;
	    }
	}
	# description can be present
	if (! $description) {
	    logger("WARNING", "description for directory $dirname (in $dirpath/) missing!");
	    $warncnt++;
	}
	# check with dirs on filesystem 
	my $fn;
	if ($dirpath && ($dirpath ne '.')) {
	    $fn = "$dirpath/$dirname";
	} else {
	    $fn = "$dirname";
	}
        #logger('DEBUG', "dir: \"$dirname\", \"$dirpath\", fn: \"$fn\"");
	if ($$fsdirs{$fn}) {
	    #logger("  OK ($$fsdirs{$fn})");
	    $okdirs{$fn} = $dirname;
	} else {
	    if ($do_rewrite) {
		# remove dir tag
		logger("WARNING", "directory $dirname (in $dirpath/) no longer on disk!");
		$dirnode->unbindNode();
		$warncnt++;
	    } else {
		logger("ERROR", "directory $dirname (in $dirpath/) missing on disk!");
		$errcnt++;
	    }
	}
    }
    #logger("dirs: ", (scalar keys %$fsdirs), " vs ", (scalar keys %okdirs), "");
    if ((scalar keys %$fsdirs) != (scalar keys %okdirs)) {
	# number of dir tags and dirs don't match
	# iterate through all dirs
	foreach my $f (sort keys %$fsdirs) {
	    # was this dir missing?
	    if (! $okdirs{$f}) {
		my ($name, $path) = split_file_path($f, 1);
		# name must be valid
		if (! valid_dir_name($name)) {
		    $path = "." unless ($path);
		    logger("ERROR", "directory name $name (in $path/) invalid!");
		    $errcnt++;
		    next;
		}
		if ($fix_dirs) {
		    # add missing dir tag
		    my $dir_node = $root->addNewChild($namespace, "dir");
		    $xml_changed++;
		    # add name
		    my $name_node = $dir_node->addNewChild($namespace, "name");
		    $name_node->appendTextNode($name);
		    # add path
		    if ($path) {
			my $path_node = $dir_node->addNewChild($namespace, "path");
			$path_node->appendTextNode($path);
		    }
		    logger("INFO", "directory $f to be added to index file!");
		} else {
		    logger("ERROR", "directory $f missing in index file!");
		    $errcnt++;
		}
	    }
	}
    }
}


#
# check_files($rootnode, \%files)
#
# reads all file elements under $rootnode and compares with the file list
# in %files
#
sub check_files {
    my ($root, $fsfiles) = @_;

    #
    # iterate all file tags
    #
    my @filenodes = $root->findnodes('child::file');
    my %okfiles;
    foreach my $filenode (@filenodes) {
	my $filename = sstrip($filenode->find('child::name'));
	my $filepath = sstrip($filenode->find('child::path'));
	my $filesize = sstrip($filenode->find('child::size'));
	my $filedate = sstrip($filenode->find('child::date'));
	# name must be valid
	if (! valid_file_name($filename)) {
	    if ($do_rewrite) {
		logger("WARNING", "file name ($filepath)$filename in index file invalid!");
		$warncnt++;
	    } else {
		logger("ERROR", "file name ($filepath)$filename invalid!");
		$errcnt++;
	    }
	}
	my $fn = ($filepath) ? "$filepath/$filename" : "$filename";
        #logger("file: \"$filename\", \"$filepath\"");
	if ($$fsfiles{$fn}) {
	    #logger("  OK ($$fsfiles{$fn})");
	    $okfiles{$fn} = $filename;
	    # check file size and date
	    if ($filesize) {
		if ($filesize != $$fsfiles{$fn}->[1]) {
		    logger("WARNING", "size of file $fn changed: $filesize to $$fsfiles{$fn}->[1]");
		    $warncnt++;
		}
	    }
	    # file date
	    if ($filedate) {
		if ($filedate ne stime($$fsfiles{$fn}->[2])) {
		    logger("WARNING", "date of file $fn changed: $filedate to ", stime($$fsfiles{$fn}->[2]), "");
		    $warncnt++;
		}
	    }
	    # update file size and date
	    if ($fix_fs_meta) {
		# delete size and date
		foreach my $n ($filenode->findnodes('child::size')) {
		    $filenode->removeChild($n);
		}
		foreach my $n ($filenode->findnodes('child::date')) {
		    $filenode->removeChild($n);
		}
		# add new size and date
		my $node = $filenode->addNewChild($namespace, "size");
		$node->appendTextNode($$fsfiles{$fn}->[1]);
		$node = $filenode->addNewChild($namespace, "date");
		$node->appendTextNode(stime($$fsfiles{$fn}->[2]));
		$xml_changed++;
	    }
	} else {
	    if ($do_rewrite) {
		# remove file tag
		logger("WARNING", "file $filename (in $filepath/) no longer on disk!");
		$filenode->unbindNode();
		$warncnt++;
	    } else {
		logger("ERROR", "file $filename (in $filepath/) missing on disk!");
		$errcnt++;
	    }
	}
    }
    #logger("files: ", (scalar keys %$fsfiles), " vs ", (scalar keys %okfiles), "");
    if ((scalar keys %$fsfiles) != (scalar keys %okfiles)) {
	# number of file tags and files don't match
	# iterate through all files
	foreach my $f (sort keys %$fsfiles) {
	    my ($name, $path) = split_file_path($f, 1);
	    # was this file missing?
	    if (! $okfiles{$f}) {
		# is an ignoreable file?
		if ($index_files{$name}) {
		    next;
		}
		# name must be valid
		if (! valid_dir_name($name)) {
		    $path = "." unless ($path);
		    logger("ERROR", "file name $name (in $path/) invalid!");
		    $errcnt++;
		    next;
		}
		if ($fix_files) {
		    # add missing file tag
		    my $file_node = $root->addNewChild($namespace, "file");
		    $xml_changed++;
		    # add name
		    my $name_node = $file_node->addNewChild($namespace, "name");
		    $name_node->appendTextNode($name);
		    # add path
		    if ($path) {
			my $path_node = $file_node->addNewChild($namespace, "path");
			$path_node->appendTextNode($path);
		    }
		    # add size
		    my $size_node = $file_node->addNewChild($namespace, "size");
		    $size_node->appendTextNode($$fsfiles{$f}->[1]);
		    # add date
		    my $date_node = $file_node->addNewChild($namespace, "date");
		    $date_node->appendTextNode(stime($$fsfiles{$f}->[2]));
		    logger("INFO", "file $f to be added to index file!");
		} else {
		    logger("ERROR", "file $f missing in index file!");
		    $errcnt++;
		}
	    }
	}
    }
}

#
# check_resource_meta($rootnode)
#
# checks general resource meta information
#
sub check_resource_meta {
    my ($rootnode) = @_;

    #
    # description
    #
    my $description = $rootnode->findvalue('child::description');
    if (! $description) {
	logger("ERROR", "resource description element missing!");
	$errcnt++;
    }
    #
    # name
    #
    my $name = sstrip($rootnode->findvalue('child::name'));
    if ($name) {
	my ($dirname, $dirpath) = split_file_path($docdir);
	if ($dirname ne $name) {
	    logger("ERROR", "resource name element '$name' does not match directory name '$dirname'!");
	    $errcnt++;
	}
    } else {
	logger("ERROR", "resource name element missing!");
	$errcnt++;
    }
    #
    # archive path
    #
    my $realpath;
    # get real path
    if ($docdir =~ /^\//) {
	# docdir is absolute
	$realpath = $docdir;
    } else {
	# docdir is relative -- try with the shell
	if (open PWDCMD, "cd $docdir ; pwd|") {
	    $realpath = <PWDCMD>;
	    chomp $realpath;
	}
    }
    if (! $realpath) {
	logger("ERROR", "unable to check real archive path!");
	$errcnt++;
	return;
    }
    # get archive-path
    my $archnode = ($rootnode->find('child::archive-path'))->get_node(1);
    if ($archnode) {
	my $arch = sstrip($archnode->textContent);
	if ($arch ne $realpath) {
	    logger("WARNING", "incorrect archive-path '$arch' will be changed to '$realpath'!");
	    $warncnt++;
	    # correct archive-path
	    $archnode->removeChildNodes;
	    $archnode->appendTextNode($realpath);
	    $xml_changed++;
	}
    } else {
	# add archive-path
	$archnode = $rootnode->addNewChild($namespace, "archive-path");
	$archnode->appendTextNode($realpath);
	$xml_changed++;
    }

}

    

#######################################################
# main
#

my ($document, $rootnode) = read_xml($metafile);

check_resource_meta($rootnode);

my $fnum = fs_read_files($docdir, "", \%files, \%dirs);
logger("INFO", "$fnum files on FS");
#foreach (keys %dirs) {logger('DEBUG', "  dir ($_): $dirs{$_}");}

check_files($rootnode, \%files);
check_dirs($rootnode, \%dirs);

logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were $errcnt errors!");
    exit 1;
} else {
    if ($fix_xml) {
	if ($dry_run) {
	    logger('INFO', "would write $metafile");
	    logger('DEBUG', $document->toString(1));
	} else {
	    write_xml($document, $metafile);
	}
    }
    logger("DONE", "index file checked successfully!");
}