head 1.6; access; symbols; locks mah:1.6; strict; comment @# @; 1.6 date 2005.05.03.19.06.05; author MarkHillebrand; state Exp; branches; next 1.5; 1.5 date 2005.04.13.11.45.24; author MarkHillebrand; state Exp; branches; next 1.4; 1.4 date 2005.04.13.08.41.58; author MarkHillebrand; state Exp; branches; next 1.3; 1.3 date 2005.02.13.19.45.25; author MarkHillebrand; state Exp; branches; next 1.2; 1.2 date 2005.02.13.18.45.02; author MarkHillebrand; state Exp; branches; next 1.1; 1.1 date 2004.07.23.21.23.43; author MarkHillebrand; state Exp; branches; next ; desc @none @ 1.6 log @cosmetic @ text @#!/usr/bin/perl -w # vim:fdm=marker:commentstring=#\ %s: # TODO deletion of local files / topics on their deletion / movement # TODO running under --rcs should add missing RCS files for non-updated topics # TODO generate (not mirror) the latest version from the RCS file when running under the --rcs switch # TODO add a warning if --url ends in slash and / or contains /bin/view or smth use LWP::UserAgent; use Getopt::Long; use IO::Handle; use Pod::Usage; # {{{ set up RequestAgent for credential handling # slighly modified from lwp-request: { package RequestAgent; @@ISA = qw(LWP::UserAgent); sub new { my $passwd_file = splice @@_,1,1; my $self = LWP::UserAgent::new(@@_); $self->agent("mirror-twiki-attachments"); if( defined $passwd_file ) { my $fh = IO::Handle->new; if( open $fh, "<$passwd_file" ) { warn "Warning: `$passwd_file' has group or world permissions\n" if (stat($passwd_file))[2] & 0077; my %a; while( $_ = <$fh> ) { # add info to current entry /^port\s+(.*)/ and $a{port} = $1 or /^proxy/ and $a{proxy} = 1 or /^proxy/ and $a{proxy} = 1 or /^path/ and $a{path} = $. or /^realm\s+(.*)/ and $a{realm} = $1 or /^login\s+(.*)/ and $a{login} = $1 or /^(?:passwd|password)\s+(.*)/ and $a{password} = $1 or /^(?:machine|host)/ or # (handled later) /^\s*$/ or warn "?Cannot parse line $. in `$passwd_file': $_"; # add authentication for completed entry if( /^(?:machine|host)\s+(.*)/ or eof($fh) ) { if( $a{host} and not $a{proxy}) { $a{port} ||= 80; #$a{login} ||= $ENV{USER}; # good idea? warn "Ignoring path entry in `$passwd_file', line $a{path}\n" if $a{path}; if( $a{realm} and $a{password} ) { $self->credentials("$a{host}:$a{port}",$a{realm},$a{login},$a{password}); } else { warn "?Incomplete authentication entry in `$passwd_file', line $a{line}\n"; } } }; # start new entry /^(?:machine|host)\s+(.*)/ and %a = (host => $1, line => $.); } } else { warn "?Cannot open `$passwd_file': $!\n"; } } $self; } sub get_basic_credentials { my($self, $realm, $uri) = @@_; my $netloc = $uri->host_port; if (exists $self->{basic_authentication}{$netloc}{$realm}) { # return cached token return @@{ $self->{basic_authentication}{$netloc}{$realm} }; } else { print "Enter username for $realm at $netloc: "; my $user = ; chomp($user); return (undef, undef) unless length $user; print "Password: "; system("stty -echo"); my $password = ; system("stty echo"); print "\n"; # because we disabled echo chomp($password); $self->credentials($netloc,$realm,$user,$password); # store info return ($user, $password); } } } # }}} # {{{ option processing my %o; unshift @@ARGV, map "--$_", split "\n", `cat .mirror-twiki-attachments.conf` if -r '.mirror-twiki-attachments.conf'; pod2usage(1) if @@ARGV==0; GetOptions(\%o, qw(help|h passwd_file=s only-rcs rcs twiki|url=s verbose|v web=s)) or pod2usage(1); pod2usage(2) unless $o{twiki} and $o{web}; pod2usage(1) if $o{help}; $o{rcs} ||= $o{'only-rcs'}; # --only-rcs implies --rcs # }}} # {{{ safety measure against clobbering up wrong directories if( !-r "topicchangedates.txt" ) { print "?no `topicchangedates.txt' file found here, please create an empty one.\n"; print "(note: this directory isn't empty, so you are maybe at the wrong place anyway!)\n" if glob "*"; exit 1; } # }}} # {{{ fetch a topic index page of the web my $marker = 'topic'; # well... improve matching quality ;) my $search = "$o{twiki}/bin/search/$o{web}/?search=.%2A®ex=on&format=|$marker|%3Cnop%3E\$topic|\$isodate|&nosearch=on&nosummary=on¬otal=on"; # ^ would LWP escape this url for us? my $ua = RequestAgent->new($o{passwd_file}); my $req = HTTP::Request->new(GET => $search); $res = $ua->request($req); if ($res->is_success) { $searchpage = $res->as_string; } else { die "Failed: ", $res->status_line, "\n"; } # TODO possibly check for TWiki oopses... my %olddates = `cat topicchangedates.txt` =~ m/(.*)\t(.*)/gm; my %currentdates = $searchpage =~ m{^]*>]*> \Q$marker\E ]*> (\w+) ]*> (\d{4}-\d{2}-\d{2}.\d{2}:\d{2})}gm; # }}} # {{{ classify topic into new, updated and deleted for (keys %currentdates) { if( !exists $olddates{$_} ) { push @@NEW, $_; } else { push @@UPDATED, $_ if $currentdates{$_} gt $olddates{$_}; # ne instead of gt would suffice... } } for (keys %olddates) { if( !exists $currentdates{$_} ) { push @@DELETED, $_; } } # }}} # {{{ schedule the attachments of new and updated topics for mirroring print "!scanning new and updated pages for attachments...\n"; for $topic (@@NEW,@@UPDATED) { my $attachpage; print "$o{web}.$topic\n" if $o{verbose}; $req->uri("$o{twiki}/bin/attach/$o{web}/$topic"); $res = $ua->request($req); if ($res->is_success) { $attachpage = $res->as_string; for $attachment ($attachpage =~ m{^\1}mg ) { $currentattachments{"$topic/$attachment"} = "$o{twiki}/pub/$o{web}/$topic/$attachment" unless $o{'only-rcs'}; $currentattachments{"$topic/$attachment,v"} = "$o{twiki}/pub/$o{web}/$topic/$attachment,v" if $o{rcs}; # create the directory if it does not exists (will check every time we arrive here) mkdir $topic if !-d $topic; } } else { print "?Could not fetch page for $topic, skipping\n"; } } # }}} # {{{ mirror now... print "!mirroring found attachments\n"; while( ($file,$url) = each %currentattachments ) { # attachments are real files on the server # so, mirror() should be okay to update them # (assuming, that the topicchangedate mechanism above cuts down # the number of scanned pages significantly anyway) $res = $ua->mirror($url,$file); if ($res->is_success) { print "!fetched $file\n"; } elsif( $res->code!=304) { # not modified... print "?Could not mirror $file ($url):" . $res->status_line . "\n"; } else { print "Not modified: $file\n" if $o{verbose}; } } # }}} # {{{ scan the @@NEW and @@UPDATED directories for superfluous attachments... # rename them with a trailing tilde # (possibly @@UPDATED would suffice... who cares, it's only local stuff) for $dir (@@NEW,@@UPDATED) { for $file (glob "$dir/*") { if( $file !~ /~$/ and !exists $currentattachments{$file} ) { print "!renaming $file to $file~\n"; rename $file, "$file~"; } } } # }}} # {{{ indicate deleted topics if( @@DELETED ) { print "!The following topics are not present anymore remotely:\n\t@@DELETED\nConsider deleting them locally\n"; } # }}} # {{{ store the topic dates that we have seen open O, ">topicchangedates.txt"; print O "$_\t$currentdates{$_}\n" for sort keys %currentdates; close O; # }}} __END__ =head1 NAME mirror-twiki-attachments - mirror attachments of a TWiki web =head1 SYNOPSIS B [B<--help>] [B<--passwd_file> F] [B<--twiki> F] [B<--verbose>] [B<--web> F] =head1 DESCRIPTION mirror-twiki-attachments visits the specified TWiki web and stores all attachments of all topics in the current, local directory. The script tries to be friendly on the server by first requesting an index page of the topics in the target web and comparing the change dates listed on that page with the last known change dates stored in the file F. Then, it visits the attachment tables of the updated or new topics and constructs a list of attachments which are to be mirrored. Afterwards, it mirrors them (i.e. it downloads them if they are newer than a local copy, if available). Note that this procedure applies to (and is of interest to) TWiki configurations which do not have the /pub directory browsable. For the first-time mirroring of a directory, create a new directory and create an empty F file in it. Options may be specified on command-line and given in the configuration file F<.mirror-twiki-attachments.conf> in the current directory; one option per line, option arguments introduced by an equality sign. =head1 OPTIONS =over 4 =item B<--help>, B<-h> Print (this) help on options. =item B<--passwd_file> F Use the F for authentication information; the file format of that file is the same as for the w3m browser. =item B<--only-rcs> Download only RCS files on the TWiki server. I RCS files may get very big, especially for binary files with many versions. =item B<--rcs> Download the RCS files on the TWiki server as well. I RCS files may get very big, especially for binary files with many versions. =item B<--twiki> F The URL of the TWiki server. =item B<--verbose>, B<-v> Be more verbose while mirroring. =item B<--web> F The name of the web to mirror. =back =head1 EXAMPLES The file format of the password file is compatible with that of the w3m browser. An example of a host entry in that file is given here: host www.twiki-host.domain port 443 realm Enter your WikiName: (First name and last name; no space; no dots; capitalized; e.g. JohnSmith). Cancel to register if you do not have one. login WikiName password seCRet The port defaults to 80 if not given. The line with the realm option has been split for readibility; it must appear as a single line in the password file. =head1 BUGS AND CAVEATS For SSL access, IO::Socket::SSL or a similar package has to be installed. Standard directory layout of the TWiki is assumed. Local deletion of directories won't be detected until topic is updated Topic dates are updated even if mirroring failed. A directory for a topic is created even if this topic has no attachment. Local deletion of a topic will not be detected until the topic is updated on the server. Attachments that are deleted or moved in the TWiki web are not deleted locally but rather renamed adding a tilde suffix. Also, deleted topics are only reported but not deleted locally. Thus, real deletion has to be done manually (e.g., to delete all backup files, you may execute C). =head1 COPYRIGHT (C) 2004-2005 Mark Hillebrand . This code is released under the BSD License. Before using this software, visit L for the full license text. =head1 SEE ALSO L, L @ 1.5 log @added --only-rcs option @ text @d6 1 d93 3 a95 3 GetOptions(\%o, qw(help|h passwd_file=s only-rcs rcs twiki|url=s verbose|v web=s)) or pod2usage(2); pod2usage(1) unless $o{twiki} and $o{web}; pod2usage(2) if $o{help}; d298 1 a298 1 (C) 2004-2005 Mark Hillebrand. a302 4 =head1 AUTHOR Mark A. Hillebrand @ 1.4 log @added --rcs option, slight doc changes @ text @d92 1 a92 1 GetOptions(\%o, qw(help|h passwd_file=s rcs twiki|url=s verbose|v web=s)) or pod2usage(2); d95 1 d118 1 d146 1 a146 1 $currentattachments{"$topic/$attachment"} = "$o{twiki}/pub/$o{web}/$topic/$attachment"; d235 5 @ 1.3 log @none @ text @d3 3 a5 1 # TODO deletion... a7 1 # for ssl you need something like libio-socket-ssl-perl installed additionally! d25 1 a25 1 warn "?Warning: `$passwd_file' has group or world permissions\n" d37 2 a38 1 /^(?:machine|host)/ or /^\s*$/ or warn "?Cannot parse line $. in `$passwd_file': $_"; a86 1 # ($o{twiki},$o{web},$o{verbose},$o{passwd_file}); d92 1 a92 7 GetOptions(\%o, 'help|h', 'passwd_file=s', 'twiki|url=s', 'verbose|v', 'web=s', ) or pod2usage(2); d145 1 d233 5 d250 2 d283 2 a284 2 Old versions of an attachment are renamed, adding a tilde suffix. Deleted topics are reported but not deleted locally. d290 1 a290 2 This code is release under the BSD License. Before using this software, visit http://www.opensource.org/licenses/bsd-license.php for the full license text. d292 2 a293 1 (C) 2004-2005 Mark Hillebrand. d301 1 a301 1 L, @ 1.2 log @passwd_file, configuration file @ text @d2 1 a2 9 # (C) 2004, Mark A. Hillebrand # Purpose: mirror all attachments in a twiki web locally # (hopefully being friendly on remote host) # Usage: mirror-twiki-attachments --url http://the.host/twiki --web Theweb # (in a directory with an empty topicchangedates.txt file which is # otherwise empty) # Note: currently, this script is careful not to delete stuff. # use at your own risk. # a3 4 # TODO failures should prevent the topic date from being updated... # TODO local deletion of directories won't be detected until topic is updated # TODO should only create the directories that actually have attachments... # TODO switch to option array d9 1 d35 2 a36 1 /^(?:passwd|password)\s+(.*)/ and $a{password} = $1; d84 2 a85 1 my ($twiki,$web,$verbose,$passwd_file); d89 11 a99 6 GetOptions( "twiki=s" => \$twiki, "url=s" => \$twiki, "web=s" => \$web, "verbose" => \$verbose, "passwd_file=s" => \$passwd_file, ); d111 1 a111 1 "$twiki/bin/search/$web/?search=.%2A®ex=on&format=|$marker|%3Cnop%3E\$topic|\$isodate|&nosearch=on&nosummary=on¬otal=on"; d114 1 a114 1 my $ua = RequestAgent->new($passwd_file); d143 2 a144 2 print "$web.$topic\n" if $verbose; $req->uri("$twiki/bin/attach/$web/$topic"); d148 2 a149 2 for $attachment ($attachpage =~ m{^\1}mg ) { $currentattachments{"$topic/$attachment"} = "$twiki/pub/$web/$topic/$attachment"; d171 1 a171 1 print "Not modified: $file\n" if $verbose; d197 104 a300 1 # vim:fdm=marker:commentstring=#\ %s: @ 1.1 log @Initial Version @ text @d15 1 d20 1 d29 1 d32 35 d73 1 a73 1 if (exists $self->{'basic_authentication'}{$netloc}{$realm}) { d75 1 a75 1 return @@{ $self->{'basic_authentication'}{$netloc}{$realm} }; d94 3 a96 1 my ($twiki,$web); d102 1 d118 1 a118 1 my $ua = RequestAgent->new; @