root/honeyclient/branches/bug/42/lib/HoneyClient/Agent/Driver/Browser/FF.pm

Revision 96, 42.0 kB (checked in by kindlund, 2 years ago)

Completed registry parser documentation and unit tests; corrected minor mispellings; updated POD documentation to reflect public website.

  • Property svn:keywords set to Id "$file"
Line 
1 ###############################################################################
2 # Created on:  May 11, 2006
3 # Package:     HoneyClient::Agent::Driver::FF
4 # File:        FF.pm
5 # Description: A specific driver for automating an instance of
6 #              the Firefox browser, running inside a
7 #              HoneyClient VM.
8 #
9 # @author knwang, xkovah, kindlund, ttruong
10 #
11 # Copyright (C) 2006 The MITRE Corporation.  All rights reserved.
12 #
13 # This program is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU General Public License
15 # as published by the Free Software Foundation, using version 2
16 # of the License.
17 #
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 # GNU General Public License for more details.
22 #
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301, USA.
27 #
28 ###############################################################################
29
30 package HoneyClient::Agent::Driver::FF;
31
32 # XXX: Disabled version check, Honeywall does not have Perl v5.8 installed.
33 #use 5.008006;
34 use strict;
35 use warnings;
36 use Carp ();
37 use Config;
38 use Win32::Job;          #For starting browser
39 use HTML::LinkExtor;     #For extracting links from HTML
40 use HTML::HeadParser;    #For extracting the meta w/ URL that LinkExtor misses
41 use LWP::UserAgent;      #Perl-based "browser"
42 use URI;                 #For absolutizing relative URLs
43 #use Data::Dumper;       #For Debugging
44
45 # Traps signals, allowing END: blocks to perform cleanup.
46 use sigtrap qw(die untrapped normal-signals error-signals);
47
48 ###############################################################################
49 # Module Initialization                                                       #
50 ###############################################################################
51
52 BEGIN {
53
54     # Defines which functions can be called externally.
55     require Exporter;
56     our ( @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS, $VERSION );
57
58     # Set our package version.
59     $VERSION = 0.9;
60
61     # Define inherited modules.
62     use HoneyClient::Agent::Driver;
63
64     @ISA = qw(Exporter HoneyClient::Agent::Driver);
65
66     # Symbols to export on request
67     # Note: Since this module is object-oriented, we do *NOT* export
68     # any functions other than "new" to call statically.  Each function
69     # for this module *must* be called as a method from a unique
70     # object instance.
71     @EXPORT = qw();
72
73     # Items to export into callers namespace by default. Note: do not export
74     # names by default without a very good reason. Use EXPORT_OK instead.
75     # Do not simply export all your public functions/methods/constants.
76
77     # This allows declaration use HoneyClient::Agent::Driver::FF ':all';
78     # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
79     # will save memory.
80
81     # Note: Since this module is object-oriented, we do *NOT* export
82     # any functions other than "new" to call statically.  Each function
83     # for this module *must* be called as a method from a unique
84     # object instance.
85     %EXPORT_TAGS = ( 'all' => [qw()], );
86
87     # Symbols to autoexport (:DEFAULT tag)
88     @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
89
90     $SIG{PIPE} = 'IGNORE';    # Do not exit on broken pipes.
91 }
92 our ( @EXPORT_OK, $VERSION );
93
94 ###############################################################################
95
96 # Include the Global Configuration Processing Library
97 use HoneyClient::Util::Config qw(getVar);
98
99 # Use ISO 8601 DateTime Libraries
100 use DateTime::HiRes;
101
102 # Use Storable Library
103 use Storable qw(dclone);
104
105 my %PARAMS = (
106
107     # This is a hashtable of fully qualified URLs
108     # to visit by the browser.  Specifically, the 'key' is
109     # the absolute URL and the 'value' is always 1.
110     links_to_visit => {},
111
112     # This is a hashtable of fully qualified URLs that the
113     # browser has already visited.  Specifically, the
114     # 'key' is the absolute URL and the 'value' is a string
115     # representing the date and time of when the link was visited.
116     #
117     # Note: See _getTimestamp() for the corresponding date/time
118     # format.
119     links_visited => {},
120
121     # This is a hashtable of URLs that the browser has found
122     # during its traversal process, but the browser could not
123     # access the link.
124     #
125     # Links could be added to this list if access requires any type of
126     # authentication, or if the link points to a non-HTTP or HTTPS
127     # resource (i.e., "javascript:doNetDetect()").
128     #
129     # The 'key' is the absolute URL and the 'value' is a string
130     # representing the date and time of when the link was visited.
131     #
132     # Note: See _getTimestamp() for the corresponding date/time
133     # format.
134     links_ignored => {},
135
136     # This is a hashtable of fully qualified URLs
137     # that all share a common *hostname*.  This hashtable should be
138     # initially empty.  As the driver extracts and removes new URLs
139     # off the 'links_to_visit' hashtable, driving the browser to each URL,
140     # any *relative* links found are added into this hashtable; any
141     # *external* links found are added back into the 'links_to_visit'
142     # hashtable.
143     #
144     # When navigating to the next link, this hashtable is exhausted prior
145     # to the main 'links_to_visit' hashtable.  This allows a
146     # browser to navigate to all links hosted on the same server, prior
147     # to contacting a different server.
148     #
149     # Specifically, the 'key' is the absolute URL and the 'value'
150     # is always 1.
151     relative_links_to_visit => {},
152
153     # This is a scalar that contains the next URL to visit.
154     # It is updated dynamically, any time _dep_getNextLink() is called.
155     # When the browser is ready to drive to the next link,
156     # 'next_link_to_visit' is checked.  If that value is undef, then
157     # the 'relative_links_to_visit' hashtable is checked next.
158     # If that hashtable is empty, then finally the 'links_to_visit'
159     # hashtable is checked.
160     next_link_to_visit => undef,
161
162     # This is a hashtable of URLs that the browser has found
163     # during its traversal process, but the browser could not
164     # access the resource due to the operation timing out.
165     #
166     # The 'key' is the absolute URL and the 'value' is a string
167     # representing the date and time of when the link was visited.
168     #
169     # Note: See _getTimestamp() for the corresponding date/time
170     # format.
171     links_timed_out => {},
172
173     # Absolute path to firefox executable. Needs to be in Windows
174     # path format not Cygwin, because it's used in Win32::Job
175     ff_exec => getVar(name => "ff_exec"),
176
177     # Hashtable used to hold hostnames and ports for any other sites which may
178     # need to be looked up (for instance external image URLs)
179     # Entries are in the form
180     # {hostname => port, ...}
181     hosts_to_resolve => {},
182
183     # A switch which can be turned off (set to 0) to decide NOT to drive ff
184     # to a particularly instance of a URL. For instance because the URL in
185     # question actually redirects somewhere else and you don't want to get
186     # a false positive on the firewall.
187     drive_ff => 0,
188
189     # Because the FF.pm drive() function actually uses the perl LWP::UserAgent
190     # to actually process a page and find links, there are actually two things
191     # going on inside this function. Because of the interaction between next()
192     # and drive() it is therefore necessary to pop back out of the drive()
193     # after it's got the links but before it drives ff, so that the data can
194     # be used by next() and given back to the firewall before it drives ff.
195     # This toggle lets every other instance of drive() either use the
196     # LWP::UserAgent or Firefox.
197     flip_flop => 0,
198
199     # An integer, representing how many relative links the browser
200     # should continue to drive to, before moving onto another
201     # website.  If negative, then the browser will exhaust all possible
202     # relative links, before moving on.  (This internal variable should
203     # never be modified externally.)
204     max_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"),
205
206     # NOTE: Don't delete me unless you delete max_relative_links_to_visit
207     # An integer, representing the maximum number of relative links that
208     # the browser should visit, before moving onto another website.  If
209     # negative, then the browser will exhaust all possible relative links
210     # found, before moving on.  This functionality is best effort; it's
211     # possible for the browser to visit new links on previously visited
212     # websites.
213     _remaining_number_of_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"),
214    
215     #COPY DARIEN'S COMMENTS! YAY! (or not, boo!))
216     _next_connections => {},
217
218     #This is here so that it doesn't have to do a getVar every time it wants to run
219     # the LWP stuff. Can maybe be moved into the big datastructure.
220     http_proxy => getVar(name => "http_proxy"),
221
222 );
223
224 ###############################################################################
225 # Private Methods Implemented                                                 #
226 ###############################################################################
227
228 # Helper function designed to retrieve the next link for the browser
229 # to navigate to.
230 #
231 # Note: Calling this function will implicitly remove the next link from
232 #       any and all applicable hashtables/scalars.
233 #
234 # When getting the next link, 'next_link_to_visit' is checked first.
235 # If that value is undef, then the 'relative_links_to_visit' hashtable
236 # is checked next.  If that hashtable is empty, then finally the
237 # 'links_to_visit' hashtable is checked.
238 #
239 # Inputs: HoneyClient::Agent::Driver::FF object
240 # Outputs: link, or undef if all applicable scalars/hashtables are empty
241 sub _getNextLink {
242
243     # Get the object state.
244     my $self = shift;
245
246     # Set the link to find as undef, initially.
247     my $link = undef;
248
249     while ( !defined($link) or ($link eq "") ) {
250
251         # Try getting the next link from the next link
252         # scalar.
253         $link = $self->next_link_to_visit;
254         $self->{next_link_to_visit} = undef;
255
256         # If the next link scalar is empty, try
257         # getting a link from the relative hashtable.
258         unless ( defined($link) ) {
259             $link = _pop( $self->relative_links_to_visit );
260         }
261
262         # If the relative hashtable is empty, try getting one
263         # from the external hashtable.
264         unless ( defined($link) ) {
265             $link = _pop( $self->links_to_visit );
266         }
267
268         # If all hashtables/scalars were empty, immediately return an
269         # undef value.
270         unless ( defined($link) ) {
271             return $link;
272         }
273
274         # Now, make sure the link is valid, before we return
275         # it; if it's not valid, we simply move on to the next
276         # one in our hashtables.
277         #$link = $self->_validateLink($link);
278     }
279
280     # Return the next link found.
281     return $link;
282 }
283 ###############################################################################
284
285 # Helper function designed to get a current timestamp from
286 # the system OS.
287 #
288 # Note: This timestamp is in ISO 8601 format.
289 #
290 # Inputs: none
291 # Outputs: timestamp
292 sub _getTimestamp {
293     my $dt = DateTime::HiRes->now();
294     return $dt . "." . $dt->nanosecond();
295 }
296 ###############################################################################
297 # Helper function designed to "pop" a key off a given hashtable.
298 # When given a hashtable reference, this function will extract a valid key
299 # from the hashtable and delete the (key, value) pair from the
300 # hashtable.
301 #
302 # Note: There is no guaranteed order about how this function picks
303 # keys from the hashtable.
304 #
305 # Inputs: hashref
306 # Outputs: valid key, or undef if the hash is empty
307 sub _pop {
308
309     # Get supplied hash reference.
310     my $hash = shift;
311
312     # Get a new key.
313     my @keys = keys( %{$hash} );
314     my $key  = pop(@keys);
315
316     # Delete the key from the hashtable.
317     if ( defined($key) ) {
318         delete $hash->{$key};
319     }
320 ##  print "in pop " . Dumper($hash);
321
322     # Return the key found.
323     return $key;
324 }
325
326 ###############################################################################
327
328 # Helper function, designed to extract the hostname
329 # (and, if it exists, the port number) from a given
330 # URL.
331 #
332 # For example, if "http://hostname.com:80/path/index.html"
333 # is given, then "hostname:80" would be returned.
334 #
335 # Inputs: URL
336 # Outputs: hostname[:port]
337 sub _extractHostname {
338
339     # Sanity check.
340     my $arg = shift();
341
342     if (!defined($arg)) {
343         return "";
344     }
345
346     # Get the URL supplied.
347     my $url = $arg;
348
349     # Note: The '?' chars make a critical difference
350     # in how this regex operates.
351     $url =~ s/.*?\/\/(.*?)\/.*/$1/;
352
353     # Return the extracted hostname.
354     return $url;
355 }
356 ###############################################################################
357
358 # Helper function, designed to process all links found at a
359 # given URL, once the browser has been driven to that URL
360 # and has collected all corresponding links.
361 #
362 # When supplied with the array of URLs
363 # this function will categorize them
364 # as follows:
365 #
366 # "New" links are those we've never driven the browser to.
367 # "Old" links are those we've driven the browser to before.
368 #
369 # - If a link is new and "invalid", then it gets added to
370 #   the 'links_ignored' hashtable.
371 #
372 # - If a link is old and "invalid", then it gets
373 #   ignored.
374 #
375 # - If a link is old and "valid", then it gets ignored.
376 #
377 # - If a link is new and "valid", then we check to see if
378 #   the referring URL's hostname[:port] and the link's
379 #   hostname[:port] match.  If they match, then the link
380 #   is added to the 'relative_links_to_visit' hash.
381 #   Otherwise, the link is added to the 'links_to_visit'
382 #   hash.
383 #
384 # Inputs: HoneyClient::Agent::Driver::FF object,
385 #         referring URL,
386 #         array of LinkExtor link arrays
387 # Outputs: HoneyClient::Agent::Driver::FF object
388
389 sub _processLinks {
390
391     # Get the object state.
392     my $self = shift;
393
394     # Get the referrer and the corresponding array of links.
395     my ( $base_url, @links ) = @_;
396
397     # Sanity check: Make sure we've been fed an object.
398     unless ( ref($self) ) {
399         Carp::croak "Error: Function must be called in reference to a "
400           . __PACKAGE__
401           . "->new() object!\n";
402     }
403
404     foreach my $link (@links) {
405
406         #format of this array (pointed to by the $link ref) is
407         # like ("img", "src", "$url") or ("a", "href", "$url")
408
409         #skip some link types for now
410         if (   @{$link}[0] eq "form"
411             || @{$link}[0] eq "input"
412             || @{$link}[0] eq "script"
413             || @{$link}[0] eq "td"
414             || @{$link}[0] eq "table" )
415         {
416             next;
417         }
418
419     #Need to put these in a bucket so the firewall knows to allow access to them
420         if (   @{$link}[0] eq "img"
421             || @{$link}[0] eq "object"
422             || @{$link}[0] eq "embed" )
423         {
424
425             # lazy way, just absolutize right away incased it's a relative URL
426             # this should catch any time a base href is actually used
427             my $tmp_hn =
428               _extractHostname( URI->new_abs( "@{$link}[2]", "$base_url" ) );
429 ##          print "@{$link}[0] @{$link}[2] $tmp_hn\n";
430             my $tmp_port;
431
432     #if it already has a port appended use that, otherwise set it to the default
433             if ( $tmp_hn =~ /.*?:(.*)/ ) {
434                 $tmp_port = $1;
435             }
436             else {
437
438                 #Can you have image links to other protocols?
439                 #if(@{$link}[2] =~ /^ftp/) {$tmp_port = 21;}
440                 if ( @{$link}[2] =~ /^http/ )  { $tmp_port = 80; }
441                 if ( @{$link}[2] =~ /^https/ ) { $tmp_port = 443; }
442                 else { $tmp_port = 80; }
443             }
444             if(!exists($self->{hosts_to_resolve}{$tmp_hn})){
445                 $self->{hosts_to_resolve}{$tmp_hn} = $tmp_port;
446                 print "adding $tmp_hn to hosts_to_resolve bucket in _processLinks\n";
447             }
448            
449             next;
450
451         }
452
453         if (   @{$link}[0] ne "a"
454             && @{$link}[0] ne "link"
455             && @{$link}[0] ne "area"
456             && @{$link}[0] ne "iframe" )
457         {
458             print "new type of link!: @{$link}\n";
459             $self->links_ignored->{ @{$link}[2] } = _getTimestamp();
460             next;
461         }
462
463         #strip anchors
464         @{$link}[2] =~ s/\#.*//;
465
466         #trying to avoid links already in the buckets
467         if (   $self->_preexisting( @{$link}[2] )
468             || @{$link}[2]    eq $self->next_link_to_visit
469             || "@{$link}[2]/" eq $self->next_link_to_visit )
470         {
471             next;
472         }
473
474         #       print "@{$link}\n";
475
476        #starts with http(s)://
477        # Check to make sure it's not an absolute link to a site we're already on
478        # The second case of the if() is "the slashdot exception" ;)
479         if ( @{$link}[2] =~ /^http[s]?:\/\//i || @{$link}[2] =~ /^\/\//i ) {
480             my $tmp_url;
481             if ( _extractHostname( @{$link}[2] ) eq
482                 _extractHostname( $self->{next_link_to_visit} ) )
483             {
484                 # URI->new_abs() will never return undef; however,
485                 # it could die if invalid parameters are supplied.
486                 $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" );
487                 $self->relative_links_to_visit->{$tmp_url} = 1;
488                 next;
489             }
490             else {
491                 # URI->new_abs() will never return undef; however,
492                 # it could die if invalid parameters are supplied.
493                 $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" );
494                 $self->links_to_visit->{$tmp_url} = 1;
495                 next;
496             }
497         }
498
499         #ignore mailto and javascript links
500         if (   @{$link}[2] =~ /^mailto/
501             || @{$link}[2] =~ /;$/
502             || @{$link}[2] =~ /^javascript:/ )
503         {
504             $self->links_ignored->{ @{$link}[2] } = _getTimestamp();
505             next;
506         }
507
508        #ignore any other URI types for now since we've already got http(s) links
509         if ( @{$link}[2] =~ /^\w*:\/\// ) {
510             $self->links_ignored->{ @{$link}[2] } = _getTimestamp();
511             next;
512         }
513
514        #From here on out it should be a relative link, so first we absolutize it
515         # URI->new_abs() will never return undef; however,
516         # it could die if invalid parameters are supplied.
517         my $uri = URI->new_abs( "@{$link}[2]", "$base_url" );
518
519       # Then we check to make sure the absolute link isn't a variant of the page
520       # we got it from, and not already visited
521         if (   $uri ne $base_url
522             && $uri    ne "$base_url/"
523             && $uri    ne $self->next_link_to_visit
524             && "$uri/" ne $self->next_link_to_visit )
525         {
526
527             if ( !$self->_preexisting($uri) ) {
528
529 #               print "2 adding (formerly) relative link $uri to the relative links to visit\n";
530                 $self->relative_links_to_visit->{$uri} = 1;
531             }
532         }
533
534     }
535
536     # Return the modified object state.
537     return $self;
538 }
539
540 ###############################################################################
541
542 # returns 1 if a URI exists somewhere in the buckets
543 # otherwise returns 0
544
545 # All the commented out code is related to whether we should test for the
546 # existence of a link that is the same as the current link except possibly
547 # having or missing a trailing slash.
548 # Currently that code is not used, and therefore it will treat
549 # http://www.foo.com/bar
550 # as a different link than
551 # http://www.foo.com/bar/
552 # And therefore go to both link independantly
553 # Comments on whether this should be the case or not are greatly appreciated
554
555 sub _preexisting {
556     my $self = shift;
557     my $uri  = shift;
558
559     #       if($uri !~ /\/$/){
560     #try as is only (currently...and with trailing slash maybe later)
561     if (
562         exists( $self->links_to_visit->{$uri} )
563         ||
564
565         #                   $self->links_to_visit->{"$uri/"} ||
566         exists( $self->relative_links_to_visit->{$uri} ) ||
567
568         #                   $self->relative_links_to_visit->{"$uri/"} ||
569         exists( $self->links_ignored->{$uri} ) ||
570
571         #                   $self->links_ignored->{"$uri/"} ||
572         exists( $self->links_visited->{$uri} )
573       )
574     {
575
576         #                   $self->links_visited->{"$uri/"} ){
577         return 1;
578     }
579
580     #       }
581     #       else{
582     #try without a trailing slash
583     #           $uri =~ s/\/$//;
584     #           if(    $self->links_to_visit->{$uri} ||
585     #                  $self->relative_links_to_visit->{$uri} ||
586     #                  $self->links_ignored->{$uri} ||
587     #                  $self->links_visited->{$uri} ){
588     #               return 1;
589     #           }
590     #       }
591
592     return 0;
593 }
594
595 ###############################################################################
596 # Public Methods Implemented                                                  #
597 ###############################################################################
598
599 =pod
600
601 =head1 METHODS IMPLEMENTED
602
603 The following functions have been implemented by the FF driver.  Many
604 of these methods were implementations of the parent Driver interface.
605
606 As such, the following code descriptions pertain to this particular
607 Driver implementation.  For further information about the generic
608 Driver interface, see the L<HoneyClient::Agent::Driver> documentation.
609
610 =head2 HoneyClient::Agent::Driver::FF->new($param => $value, ...)
611
612 =over 4
613
614 Creates a new FF driver object, which contains a hashtable
615 containing any of the supplied "param => value" arguments.
616
617 I<Inputs>:
618  B<$param> is an optional parameter variable.
619  B<$value> is $param's corresponding value.
620  
621 Note: If any $param(s) are supplied, then an equal number of
622 corresponding $value(s) B<must> also be specified.
623
624 I<Output>: The instantiated FF driver B<$object>, fully initialized.
625
626 =back
627
628 =begin testing
629
630 # XXX: Add this.
631 1;
632
633 =end testing
634
635 =cut
636
637 sub new {
638
639     # - This function takes in an optional hashtable,
640     #   that contains various key => 'value' configuration
641     #   parameters.
642     #
643     # - For each parameter given, it overwrites any corresponding
644     #   parameters specified within the default hashtable, %PARAMS,
645     #   with custom entries that were given as parameters.
646     #
647     # - Finally, it returns a blessed instance of the
648     #   merged hashtable, as an 'object'.
649
650     # Get the class name.
651     my $self = shift;
652
653     # Get the rest of the arguments, as a hashtable.
654     # Hash-based arguments are used, since HoneyClient::Util::SOAP is unable to handle
655     # hash references directly.  Thus, flat hashtables are used throughout the code
656     # for consistency.
657     my %args = @_;
658
659     # Check to see if the class name is inherited or defined.
660     my $class = ref($self) || $self;
661
662     # Initialize default parameters.
663     my %params = %{ dclone( \%PARAMS ) };
664     $self = $class->SUPER::new();
665     @{$self}{ keys %params } = values %params;
666
667     # Now, overwrite any default parameters that were redefined
668     # in the supplied arguments.
669     @{$self}{ keys %args } = values %args;
670
671     # Now, assign our object the appropriate namespace.
672     bless $self, $class;
673    
674     #bla
675     $self->{_remaining_number_of_relative_links_to_visit} =
676         $self->{max_relative_links_to_visit};
677
678     # Finally, return the blessed object.
679     return $self;
680 }
681
682 ###############################################################################
683
684 =pod
685
686 =head2 $object->drive()
687
688 =over 4
689
690 Drives an instance of Firefox for one iteration,
691 navigating to the next URL and updating the driver's corresponding
692 internal hashtables accordingly.
693
694 For a description of which hashtable is consulted upon each
695 iteration of drive(), see the L<next_link_to_visit> documentation, in
696 the "DEFAULT PARAMETER LIST" section.
697
698 Once a drive() iteration has completed, the corresponding Firefox browser
699 process is terminated.  Thus, each call to drive() invokes a new instance of
700 the browser.
701
702 I<Output>: The updated FF driver B<$object>, containing state information
703 from driving Firefox for one iteration.
704
705 B<Warning>: This method will B<croak> if the FF driver object is B<unable>
706 to navigate to a new link, because its list of links to visit is empty.
707
708 =back
709
710 =begin testing
711
712 # XXX: Test this.
713 1;
714
715 =end testing
716
717 =cut
718
719 sub drive {
720
721     # Get the object state.
722     my $self     = shift;
723     my $base_url = undef;
724
725     #   my $drive_ff = 1;   # To be set to one for some corner cases where
726     # we don't want the real browser to visit a site
727
728     # Sanity check: Make sure we've been fed an object.
729     unless ( ref($self) ) {
730         Carp::croak "Error: Function must be called in reference to a "
731           . __PACKAGE__
732           . "->new() object!\n";
733     }
734
735     # Get the next URL from our hashtables.
736     my $url = $self->_dep_getNextLink();
737
738     # Sanity check: Make sure our next URL is defined.
739     unless ( defined($url) ) {
740         Carp::croak "Error: Unable to drive browser - 'links_to_visit' "
741           . "hashtable is empty!\n";
742     }
743
744     ###########################
745     #Drive the LWP-based "browser"
746     ###########################
747     if ( $self->{flip_flop} == 0 ) {
748         print "LWP browsing to $url\n";
749         $self->{drive_ff} = 1;
750         my $ua = LWP::UserAgent->new( "keep_alive" => 300, max_redirect => 0,
751                             timeout => $self->{timeout});
752         if($self->{http_proxy} ne 'none'){
753                 $ua->proxy( 'http', $self->{http_proxy});
754         }
755        
756         #Set up the headers to mimic FF
757         #TODO: LWP::UserAgent headers are not *exactly* the same as Firefox still
758         # this is because if I turn off the TE header manually in the .pm file
759         # and if I set the 'Accept-Encoding' to 'gzip,deflate' (which is what is
760         # in the TE header too) it still doesn't process compressed info correctly
761         $ua->agent(
762 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.5'
763         );
764         $ua->default_header( 'Accept' =>
765 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
766         );
767         $ua->default_header( 'Accept-Language' => 'en-us,en;q=0.5' );
768         $ua->default_header(
769             'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7' );
770
771         #    $ua->default_header('Accept-Encoding' => 'gzip,deflate');
772
773         # get() returns an HTTP::Response object
774         # and the raw html will be in $data->content
775         my $data = $ua->get($url);
776         if ( !defined($data) ) {
777             print "got back a bad HTTP::Response object. Dying\n";
778             $self->{next_link_to_visit} = undef;
779             exit;
780         }
781
782         if ( $data->is_redirect ) {
783             print "GOT REDIRECTED trying to go to $url\n";
784             $self->links_visited->{$url} = _getTimestamp() . ":http_refresh";
785             $self->{drive_ff} = 0;
786         }
787
788         if ( $data->content() =~ /^[0-9][0-9][0-9]/ ) {
789
790             # In practice I have only seen an error message starting with 500
791             # to get to this case.(when the socket connect fails or your ctrl-c)
792             #  - Xeno
793             my $tmp = $data->content();
794             chomp($tmp);
795             print "Error: $tmp. ABORTING\n";
796             $self->{next_link_to_visit} = undef;
797             return -1;
798         }
799
800         #Before we give it to LinkExtor, check for any meta refresh URLs
801         my $p = HTML::HeadParser->new();
802         # TODO: Check to make sure the parser is functioning properly.
803         $p->parse( $data->content() );
804
805         #The HeadParser doesn't get us all the way there, still need regex
806 ###     print "HEADER\n" . Dumper($p->header) . "\n";
807         if ( defined( $p->header->{refresh} )
808             && $p->header->{refresh} =~ /.*URL=(.*)/i )
809         {
810             print "META REFRESH going to $1\n";
811
812             #Just pretend we went there
813             #Also absolutize just incase (since I've seen it needed)
814             # URI->new_abs() will never return undef; however,
815             # it could die if invalid parameters are supplied.
816             my $uri = URI->new_abs($1, $url);
817            
818             $self->links_to_visit->{$uri}  = _getTimestamp();
819             $self->links_visited->{$url} = _getTimestamp() . ":meta_refresh";
820
821            # TODO: Should *really* only not go if the refresh timer is less than
822            # our timeout...Currently it ignores the time alltogether