#!/usr/bin/perl use warnings; use strict; $|=1; my ($opt_v, $opt_d,); use Getopt::Long qw(:config no_ignore_case bundling); GetOptions( 'd' => \$opt_d, # dont load defaults 'v=i' => \$opt_v, # verbosity ); my $DEFAULT = { verbose => $opt_v, todo => {}, seen => {}, steps => 0, spool => "spool", timeout => 180, locktime => 300, keep_alive => 10, }; my $STATE = { %$DEFAULT }; eval { require Sys::Hostname; my $hn = Sys::Hostname->hostname(); $hn =~ s/\..*$//; $STATE->{hostname} ||= $hn; }; eval { use TMA::Config; $STATE->{memcached_loot_servers} ||= $TMA::Config::memcached_loot_servers; }; my @STACK = (); push @STACK, "load pillage-defaults.txt" unless $opt_d; push @STACK, @ARGV; while (@STACK) { my $cmd = shift @STACK; chomp $cmd; $cmd =~ s/\s+/ /g; $cmd =~ s/(^ | $)//g; next unless length $cmd; next if $cmd =~ /^#/; my $ll = 5; my $lm = "cmd '%s' ..."; my @lp = ($cmd,); if ($cmd =~ /^((?:https?|ftp):\S+)$/) { my $url = $1; $lm .= " is_url ..."; $STATE->{todo} ||= {}; if ($STATE->{todo}->{$url}) { $lm .= " redundant_todo %i ..."; push @lp, $STATE->{todo}->{$url}; } else { $lm .= " new_todo ..."; } $STATE->{todo}->{$1} = { source => 'user', depth => 0, }; } elsif ($cmd =~ /^load(?:[\s:]+)(\S+)$/) { my ($file,) = ($1,); $lm .= " loadfile(%s) ..."; push @lp, $file; open LF, "<", $file or die "cant open '$file': $!"; my @loaded = ; close LF or die "cant close readonly '$file': $!"; unshift @STACK, @loaded; $lm .= " loaded %i items ..."; push @lp, scalar(@loaded); } elsif ($cmd =~ /^(\w+)\s*=\s*(\S+)$/) { my ($key, $val,) = ($1, $2,); $lm .= " is_variable ..."; if (exists $STATE->{$key}) { $lm .= " replacing '%s' ..."; push @lp, (defined $STATE->{$key} ? $STATE->{$key} : ""); } $STATE->{$key} = $val; } elsif ($cmd =~ /^reset$/) { $lm .= " is_reset ..."; $STATE = { %$DEFAULT }; } elsif ($cmd =~ /^(\w+)(?: (.+))?$/ && eval "\\&cmd_$1") { my ($cmd, $arg,) = ($1, $2,); my $code = eval "\\&cmd_$1"; $lm .= " is_cmd '%s' ... code %s ..."; push @lp, $cmd, $code; &{$code}($arg); } else { die "UNKNOWN STACKITEM: '$cmd'"; } if (!@STACK && scalar keys %{$STATE->{todo}}) { push @STACK, "recurse_get"; } &log($ll,$lm,@lp); } if ($STATE->{stats}) { my $s = $STATE->{stats}; for (sort keys %$s) { printf "[%i] STATS: '%s' => %s\n", $$, $_, $s->{$_}; } exit 0; } die "never here"; ##### sub cmd_dump { require Data::Dumper; print "DUMPING:\n"; print Data::Dumper->Dump([$STATE, \@STACK],["STATE", "STACK",],); } sub cmd_crashdump { &cmd_dump(); die "requested crashdumpr"; } sub log { my $lev = shift; return unless $lev <= ($STATE->{verbose}||2); my $patt = shift; chomp $patt; printf "[%i] LOG(%i): $patt\n", $$, $lev, @_; } sub cmd_randua { delete $STATE->{ua}; my $uafile = $STATE->{ualist}; unless ($uafile) { $uafile = "ua.txt"; &log(1,"falling back to hardcoded useragent list '%s'", $uafile); } my $ua; if (-e $uafile) { open UA, "<", $uafile or die "cant open '$uafile': $!"; my @ua = ; close UA; my $pick = int(rand(scalar(@ua))); $ua = $ua[$pick]; $ua =~ s/[\s\r\n]*$//g; &log(4,"loaded %i agents from '%s', picked %i", scalar @ua, $uafile, $pick); } else { die "useragent file '$uafile' doesnt exist"; } $STATE->{ua} = $ua; } sub _get_agent { delete $STATE->{agent} unless ($STATE->{agentpid}||-1) == $$; return $STATE->{agent} if $STATE->{agent}; use LWPx::ParanoidAgent; my $ag = LWPx::ParanoidAgent->new( keep_alive => $STATE->{keep_alive}, timeout => $STATE->{timeout}, agent => $STATE->{ua}, ); # if ($STATE->{cookies}) { # my @jar = (); # for (@{$STATE->{cookies}}) { # push @jar, $_; # } # $ag-> &log(3, "initialized new agent %s", $ag); $STATE->{agent} = $ag; $STATE->{agentpid} = $$; return $ag; } sub _get_cache { delete $STATE->{cache} unless ($STATE->{cachepid}||-1) == $$; return $STATE->{cache} if $STATE->{cache}; my @par = ( @_, ) || ( servers => $STATE->{memcached_loot_servers}, ); require Cache::Memcached; my $cc = new Cache::Memcached @par; die "failed to create cacheclient" unless $cc; &log(3, "initialized new cache %s", $cc); $STATE->{cache} = $cc; $STATE->{cachepid} = $$; return $cc; } sub cmd_recurse_get { die "nothing todo" unless %{$STATE->{todo}}; my @todo = (); my $todo = {}; while (@todo || %{$STATE->{todo}}) { if ($STATE->{forkfor} && !@todo) { my $expr = $STATE->{forkfor}; my @forks = grep {/$expr/} keys %{$STATE->{todo}}; if (@forks) { &log(1,"FORK(%s): %i targets", $expr, scalar @forks); &do_fork(@forks); return 0; } } unless (@todo) { die "todo mismatch" if %$todo; $todo = $STATE->{todo}; @todo = sort keys %$todo; if (($STATE->{sort}||"") eq "filenumeric") { @todo = sort { my $A = ($a =~ /\/\D*(\d+)[^\/]*$/ ? $1 : undef); my $B = ($b =~ /\/\D*(\d+)[^\/]*$/ ? $1 : undef); if (defined $A && defined $B) { return $A <=> $B; } else { return $a cmp $b; } } @todo; } $STATE->{todo} = {}; &log(3, "TODO %i targets", scalar(@todo)); } die "nothing todo" unless @todo; my $u = shift @todo; my $ctx = delete $todo->{$u}; if ($STATE->{seen}->{$u}) { if ($STATE->{depth}) { next; } else { die "todo and seen?! '$u'"; } } my ($U, $CT, $D,) = &get_one($u, $ctx); if (defined $CT && $CT eq 'text/html') { &handle_html($U, $D, $ctx); } if (%{$STATE->{todo}} && $STATE->{depth}) { for (keys %$todo) { delete $STATE->{todo}->{$_}; } if (%{$STATE->{todo}}) { &cmd_recurse_get(); } } } die "not done hash todo" if %$todo; die "not done list todo" if @todo; } sub step { $STATE->{step}++; if ($STATE->{dumpstep} && !($STATE->{step} % $STATE->{dumpstep})) { &cmd_dump(); } } sub do_fork { delete $STATE->{forkfor}; delete $STATE->{agent}; @STACK = (); # block fork targets from being re-run for my $trg (@_) { if ($STATE->{seen}->{$trg}) { die "fork and seen: $trg"; } die "trg $trg not in todo" unless delete $STATE->{todo}->{$trg}; $STATE->{seen}->{$trg}++; } my %child = (); # fork one to continue current list my $c = fork(); die "fork failed" unless defined $c; if ($c) { &log(2,"FORKED(%i): continue current todo", $c); $child{$c} = '_current'; $STATE->{stats}->{fork}++; } else { &log(2,"CHILD(%i): continue current todo", $$); &cmd_recurse_get(); return 0; } for my $trg (@_) { $STATE->{todo} = {}; $STATE->{todo}->{$trg} = { source => 'fork', }; sleep 1; my $c = fork(); die "fork failed" unless defined $c; if ($c) { &log(2,"FORKED(%i): do '%s'", $c, $trg); $child{$c} = $trg; $STATE->{stats}->{fork}++; } else { &log(2,"CHILD(%i): working on '%s'", $$, $trg); delete $STATE->{seen}->{$trg}; &cmd_recurse_get(); return 0; } } # wait for all children to finish; while (keys %child) { my $cc = scalar keys %child; &log(2, "MASTER(%i): waiting on %i children", $$, $cc); my $d = wait(); my $cr = delete $child{$d}; die "unexpected child $d died" unless $cr; &log(2, "MASTER(%i): child %i (%s) died", $$, $d, $cr); if ($STATE->{childmin}) { if ($STATE->{childmin} >= scalar keys %child) { &log(2, "MASTER(%i): giving up on %i remaining children", $$, scalar keys %child); last; } } } } my $lastcoalesce; package LootOne; use base qw( TheSchwartz::Worker ); sub work { my $class = shift; use TheSchwartz; my TheSchwartz::Job $job = shift; my $arg = $job->arg; # use Data::Dumper; # print Dumper($arg); die "no arg" unless $arg; die "arg not ref $arg" unless ref $arg; die "arg not hashref $arg" unless ref $arg eq 'HASH'; my $url = $arg->{url}; die "no url" unless $url; $url =~ s/\s.*$//g; my $ctx = { source => "schwartz", referer => $arg->{ref}, }; main::get_one($url, $ctx,); my $coa = $job->coalesce; if ($coa) { $lastcoalesce = $coa; } $job->completed(); } package main; sub cmd_schwartz { use TheSchwartz; use TMA::Config; my $client = TheSchwartz->new( databases => $TMA::Config::schwartz_dbs, # verbose => 1, ); $client->can_do('LootOne'); while (1) { if ($lastcoalesce) { my $cc = &_get_cache(); if ($STATE->{locktime}) { unless ($cc->add("coalock:".$lastcoalesce, $$, $STATE->{locktime})) { $lastcoalesce = undef; next; } } printf "COALESCE(%s): ", $lastcoalesce; my @jobs = $client->list_jobs({ funcname => "LootOne", coalesce => $lastcoalesce, }); printf "got %i jobs ...\n", scalar @jobs; my $c = 0; for my $j (@jobs) { $cc->set("coalock:".$lastcoalesce, $$, $STATE->{locktime}); if ($client->grab_and_work_on($j->handle->as_string)) { $c++; } } $cc->delete("coalock:".$lastcoalesce); unless ($c) { $lastcoalesce = undef; } } else { $client->work_once(); } } die "never here"; } sub get_one ($$) { my ($u, $ctx,) = @_; &step(); my @args = (); my $cc = &_get_cache(); my $l2n = $cc->get("l2n:".$u); if ($l2n && $l2n->{type}) { if ($l2n->{type} =~ /^(image\/|application\/pdf$)/i) { my $sha1 = $l2n->{sha1}; if ($sha1) { use MIME::Base32 qw( RFC ); my $urn = sprintf "urn:sha1:%s", lc MIME::Base32::encode($sha1); &log(5,"L2N(%s): skip, have %s %s", $u, $l2n->{type}, $urn); return 0; } } if ($l2n->{lmod} && ($ctx->{source} ne "user")) { push @args, "If-Modified-Since", $l2n->{lmod}; } } if ($STATE->{locktime}) { if ($cc->add("lock:".$u, $$, $STATE->{locktime})) { # } else { &log(5,"LOCK(%s): skipped", $u); return 0; } } use TMA::Util; my $dt = TMA::Util::to_iso(); my $log = sprintf "GET %s ...", $u; my $ua = &_get_agent(); TRYGET: my $r = $ua->get($u, @args); my $code = $r->code; $STATE->{stats}->{"code $code"}++; #die "failed($u): ".$r->status_line unless $r->is_success; $STATE->{seen}->{$u}++; unless ($r->is_success) { $log .= sprintf " %s ...", $r->status_line; if (!$l2n && $code == 404) { my $n2l = { code => $code, when => $dt, }; $cc->set("l2n:".$u, $n2l); $log .= " setting cache"; } &log(2,$log); return 0; } $log .= sprintf " %i ...", $code; my $d = $r->content; $log .= sprintf " %i byte ...", length($d); my $len = $r->header("Content-Length"); if ($len) { unless ($len == length($d)) { $log .= sprintf " size mismatch, expected %i byte ...", $len; if ($STATE->{seen}->{$u} == 1) { $log .= " retrying ..."; goto TRYGET; } else { $log .= "giving up ..."; &log(2,$log); return 0; } } } $STATE->{stats}->{"content bytes"} += length($d); my $ct = lc $r->content_type(); $log .= sprintf " c/t '%s' ...", $ct; $STATE->{stats}->{"content type $ct"}++; $STATE->{stats}->{"content type $ct bytes"} += length($d); my @lm = $r->header('Last-Modified'); my ($lm, $lu) = (undef, 0,); for (@lm) { my $u = TMA::Util::to_unix($_); if ($u > $lu) { $lu = $u; $lm = $_; } } my $sha1 = sha1($d); use MIME::Base32 qw( RFC ); my $urn = sprintf "urn:sha1:%s", lc MIME::Base32::encode($sha1); $log .= sprintf " %s ...", $urn; my $n2l = { code => $code, size => length($d), sha1 => $sha1, when => $dt, type => $ct, lmod => $lm, }; $cc->set("l2n:".$u, $n2l); if ($lm) { use TMA::Util; $n2l->{mtime} = TMA::Util::to_iso($lm); } $n2l->{url} = $u; $n2l->{data} = $d; my $h = &spool($n2l); # &meta_add($n2l); # for ($r->header_field_names) { # printf "HEADER - %s: %s\n", # $_, $r->header($_); # } # exit 23; $log =~ s/%/%%/g; &log(2,$log); return ($u, $ct, $d,); } sub handle_html ($$) { my ($u, $d, $ctx,) = @_; my $log = "handle_html"; use Encode; my $hd = decode_utf8($d) || $d; use HTML::TreeBuilder; my $tree = HTML::TreeBuilder->new_from_content($hd); die "no tree" unless $tree; my $pu = $u; $pu =~ s,(/[^/]+/)[^/]+$,$1,; #printf " '%s' pref ...", $pu; my $qpu = quotemeta $pu; if ($STATE->{prefix}) { $qpu = $STATE->{prefix}; } my @links = @{ $tree->extract_links() }; $log .= sprintf " %i links ... ", scalar @links; my $depth = $ctx->{depth}||0; my %skips = (); my %links = (); for (@links) { my ($link, $element, $attr, $tag,) = @$_; use URI; my $lc = $link; $lc =~ s/#.*//g; # printf "\n'%s' '%s'\n", $lc, $u; my $uri = URI->new_abs($lc, $u); #my $us = $uri->as_string; my $us = $uri->canonical; my $skip = ""; my $e = $element->is_empty; if ($STATE->{maxdepth} && $STATE->{maxdepth} <= $depth) { $skip = "depth"; } elsif ($STATE->{skipempty} && $tag eq "a" && $element->is_empty) { $skip = "empty"; } elsif ($STATE->{seen}->{$us}) { $skip = "seen"; } elsif ($STATE->{blacklist} && $us =~ /$STATE->{blacklist}/i) { $skip = "black"; } elsif ($us !~ /^$qpu/i) { $skip = "prefix"; } if ($skip) { $skips{$skip}++; &log(13,"URLSKIP(%s): '%s'", $skip, $us); next; } $links{$us}++; } for(sort keys %skips) { $log .= sprintf " %i %s ...", $skips{$_}, $_; delete $skips{$_}; } $tree->delete; for (sort keys %links) { # next unless m#/\d+\.\w+$#; if ($STATE->{todo}->{$_}) { if ($STATE->{todo}->{$_}->{depth} > $depth+1) { $STATE->{todo}->{$_}->{depth} = $depth+1; } $skips{have}++; } $STATE->{todo}->{$_} = { source => "recurse", referer => $u, depth => $depth+1, }; $skips{todo}++; # printf " %4ix %s\n", $links{$_}, $_; } for(sort keys %skips) { $log .= sprintf " %i %s ...", $skips{$_}, $_; delete $skips{$_}; } $log =~ s/%/%%/g; &log(3,$log); } my $idcnt; sub spool { my $d = shift; die "not ref" unless ref $d; if (ref $d) { require Storable; $d = Storable::nfreeze($d); } use Digest::SHA qw/ sha1 /; my $h = sha1($d); my $D = $h.$d; my $now = time; die "no hostname" unless $STATE->{hostname}; $idcnt++; my $id = sprintf "%i.%s.%i.%i",$now,$STATE->{hostname},$$,$idcnt; my $pref = sprintf "%02i", ($now % 100); mkdir $STATE->{spool} unless -d $STATE->{spool}; die "no spool: ".$STATE->{spool} unless -d $STATE->{spool}; mkdir $STATE->{spool}."/".$pref unless -d $STATE->{spool}."/".$pref; die "no spoolpref: ".$STATE->{spool}."/".$pref unless -d $STATE->{spool}."/".$pref; my $fn = sprintf "%s/%s/%s", $STATE->{spool}, $pref, $id; my $tn = sprintf "%s/%s/.%s.tmp", $STATE->{spool}, $pref, $id; die "file exists: $fn" if -e $fn; die "file exists: $tn" if -e $tn; open TF, ">", $tn or die "cant open $tn: $!"; print TF $D; close TF or die "cant close $tn: $!"; rename $tn, $fn or die "cant rename($tn,$fn): $!"; die "still present $tn" if -e $tn; die "not present $fn" unless -e $fn; use File::stat; my $s = stat($fn); die "stat($fn) failed: $!" unless $s; die "size mismatch" unless $s->size == length($D); if ($STATE->{readback}) { open RF, "<", $fn or die "cant readopen($fn): $!"; local $/; my $d = ; close RF; die "data mismatch" unless $d eq $D; } return $fn; }