root/honeyclient/branches/bug/42/lib/HoneyClient/Agent/Driver/Browser/FF.pm
| Revision 96, 42.0 kB (checked in by kindlund, 2 years ago) | |
|---|---|
| |
| Line | |
|---|---|
| 1 | ############################################################################### |
| 2 | # Created on: May 11, 2006 |
| 3 | # Package: HoneyClient::Agent::Driver::FF |
| 4 | # File: FF.pm |
| 5 | # Description: A specific driver for automating an instance of |
| 6 | # the Firefox browser, running inside a |
| 7 | # HoneyClient VM. |
| 8 | # |
| 9 | # @author knwang, xkovah, kindlund, ttruong |
| 10 | # |
| 11 | # Copyright (C) 2006 The MITRE Corporation. All rights reserved. |
| 12 | # |
| 13 | # This program is free software; you can redistribute it and/or |
| 14 | # modify it under the terms of the GNU General Public License |
| 15 | # as published by the Free Software Foundation, using version 2 |
| 16 | # of the License. |
| 17 | # |
| 18 | # This program is distributed in the hope that it will be useful, |
| 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 21 | # GNU General Public License for more details. |
| 22 | # |
| 23 | # You should have received a copy of the GNU General Public License |
| 24 | # along with this program; if not, write to the Free Software |
| 25 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| 26 | # 02110-1301, USA. |
| 27 | # |
| 28 | ############################################################################### |
| 29 | |
| 30 | package HoneyClient::Agent::Driver::FF; |
| 31 | |
| 32 | # XXX: Disabled version check, Honeywall does not have Perl v5.8 installed. |
| 33 | #use 5.008006; |
| 34 | use strict; |
| 35 | use warnings; |
| 36 | use Carp (); |
| 37 | use Config; |
| 38 | use Win32::Job; #For starting browser |
| 39 | use HTML::LinkExtor; #For extracting links from HTML |
| 40 | use HTML::HeadParser; #For extracting the meta w/ URL that LinkExtor misses |
| 41 | use LWP::UserAgent; #Perl-based "browser" |
| 42 | use URI; #For absolutizing relative URLs |
| 43 | #use Data::Dumper; #For Debugging |
| 44 | |
| 45 | # Traps signals, allowing END: blocks to perform cleanup. |
| 46 | use sigtrap qw(die untrapped normal-signals error-signals); |
| 47 | |
| 48 | ############################################################################### |
| 49 | # Module Initialization # |
| 50 | ############################################################################### |
| 51 | |
| 52 | BEGIN { |
| 53 | |
| 54 | # Defines which functions can be called externally. |
| 55 | require Exporter; |
| 56 | our ( @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS, $VERSION ); |
| 57 | |
| 58 | # Set our package version. |
| 59 | $VERSION = 0.9; |
| 60 | |
| 61 | # Define inherited modules. |
| 62 | use HoneyClient::Agent::Driver; |
| 63 | |
| 64 | @ISA = qw(Exporter HoneyClient::Agent::Driver); |
| 65 | |
| 66 | # Symbols to export on request |
| 67 | # Note: Since this module is object-oriented, we do *NOT* export |
| 68 | # any functions other than "new" to call statically. Each function |
| 69 | # for this module *must* be called as a method from a unique |
| 70 | # object instance. |
| 71 | @EXPORT = qw(); |
| 72 | |
| 73 | # Items to export into callers namespace by default. Note: do not export |
| 74 | # names by default without a very good reason. Use EXPORT_OK instead. |
| 75 | # Do not simply export all your public functions/methods/constants. |
| 76 | |
| 77 | # This allows declaration use HoneyClient::Agent::Driver::FF ':all'; |
| 78 | # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
| 79 | # will save memory. |
| 80 | |
| 81 | # Note: Since this module is object-oriented, we do *NOT* export |
| 82 | # any functions other than "new" to call statically. Each function |
| 83 | # for this module *must* be called as a method from a unique |
| 84 | # object instance. |
| 85 | %EXPORT_TAGS = ( 'all' => [qw()], ); |
| 86 | |
| 87 | # Symbols to autoexport (:DEFAULT tag) |
| 88 | @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
| 89 | |
| 90 | $SIG{PIPE} = 'IGNORE'; # Do not exit on broken pipes. |
| 91 | } |
| 92 | our ( @EXPORT_OK, $VERSION ); |
| 93 | |
| 94 | ############################################################################### |
| 95 | |
| 96 | # Include the Global Configuration Processing Library |
| 97 | use HoneyClient::Util::Config qw(getVar); |
| 98 | |
| 99 | # Use ISO 8601 DateTime Libraries |
| 100 | use DateTime::HiRes; |
| 101 | |
| 102 | # Use Storable Library |
| 103 | use Storable qw(dclone); |
| 104 | |
| 105 | my %PARAMS = ( |
| 106 | |
| 107 | # This is a hashtable of fully qualified URLs |
| 108 | # to visit by the browser. Specifically, the 'key' is |
| 109 | # the absolute URL and the 'value' is always 1. |
| 110 | links_to_visit => {}, |
| 111 | |
| 112 | # This is a hashtable of fully qualified URLs that the |
| 113 | # browser has already visited. Specifically, the |
| 114 | # 'key' is the absolute URL and the 'value' is a string |
| 115 | # representing the date and time of when the link was visited. |
| 116 | # |
| 117 | # Note: See _getTimestamp() for the corresponding date/time |
| 118 | # format. |
| 119 | links_visited => {}, |
| 120 | |
| 121 | # This is a hashtable of URLs that the browser has found |
| 122 | # during its traversal process, but the browser could not |
| 123 | # access the link. |
| 124 | # |
| 125 | # Links could be added to this list if access requires any type of |
| 126 | # authentication, or if the link points to a non-HTTP or HTTPS |
| 127 | # resource (i.e., "javascript:doNetDetect()"). |
| 128 | # |
| 129 | # The 'key' is the absolute URL and the 'value' is a string |
| 130 | # representing the date and time of when the link was visited. |
| 131 | # |
| 132 | # Note: See _getTimestamp() for the corresponding date/time |
| 133 | # format. |
| 134 | links_ignored => {}, |
| 135 | |
| 136 | # This is a hashtable of fully qualified URLs |
| 137 | # that all share a common *hostname*. This hashtable should be |
| 138 | # initially empty. As the driver extracts and removes new URLs |
| 139 | # off the 'links_to_visit' hashtable, driving the browser to each URL, |
| 140 | # any *relative* links found are added into this hashtable; any |
| 141 | # *external* links found are added back into the 'links_to_visit' |
| 142 | # hashtable. |
| 143 | # |
| 144 | # When navigating to the next link, this hashtable is exhausted prior |
| 145 | # to the main 'links_to_visit' hashtable. This allows a |
| 146 | # browser to navigate to all links hosted on the same server, prior |
| 147 | # to contacting a different server. |
| 148 | # |
| 149 | # Specifically, the 'key' is the absolute URL and the 'value' |
| 150 | # is always 1. |
| 151 | relative_links_to_visit => {}, |
| 152 | |
| 153 | # This is a scalar that contains the next URL to visit. |
| 154 | # It is updated dynamically, any time _dep_getNextLink() is called. |
| 155 | # When the browser is ready to drive to the next link, |
| 156 | # 'next_link_to_visit' is checked. If that value is undef, then |
| 157 | # the 'relative_links_to_visit' hashtable is checked next. |
| 158 | # If that hashtable is empty, then finally the 'links_to_visit' |
| 159 | # hashtable is checked. |
| 160 | next_link_to_visit => undef, |
| 161 | |
| 162 | # This is a hashtable of URLs that the browser has found |
| 163 | # during its traversal process, but the browser could not |
| 164 | # access the resource due to the operation timing out. |
| 165 | # |
| 166 | # The 'key' is the absolute URL and the 'value' is a string |
| 167 | # representing the date and time of when the link was visited. |
| 168 | # |
| 169 | # Note: See _getTimestamp() for the corresponding date/time |
| 170 | # format. |
| 171 | links_timed_out => {}, |
| 172 | |
| 173 | # Absolute path to firefox executable. Needs to be in Windows |
| 174 | # path format not Cygwin, because it's used in Win32::Job |
| 175 | ff_exec => getVar(name => "ff_exec"), |
| 176 | |
| 177 | # Hashtable used to hold hostnames and ports for any other sites which may |
| 178 | # need to be looked up (for instance external image URLs) |
| 179 | # Entries are in the form |
| 180 | # {hostname => port, ...} |
| 181 | hosts_to_resolve => {}, |
| 182 | |
| 183 | # A switch which can be turned off (set to 0) to decide NOT to drive ff |
| 184 | # to a particularly instance of a URL. For instance because the URL in |
| 185 | # question actually redirects somewhere else and you don't want to get |
| 186 | # a false positive on the firewall. |
| 187 | drive_ff => 0, |
| 188 | |
| 189 | # Because the FF.pm drive() function actually uses the perl LWP::UserAgent |
| 190 | # to actually process a page and find links, there are actually two things |
| 191 | # going on inside this function. Because of the interaction between next() |
| 192 | # and drive() it is therefore necessary to pop back out of the drive() |
| 193 | # after it's got the links but before it drives ff, so that the data can |
| 194 | # be used by next() and given back to the firewall before it drives ff. |
| 195 | # This toggle lets every other instance of drive() either use the |
| 196 | # LWP::UserAgent or Firefox. |
| 197 | flip_flop => 0, |
| 198 | |
| 199 | # An integer, representing how many relative links the browser |
| 200 | # should continue to drive to, before moving onto another |
| 201 | # website. If negative, then the browser will exhaust all possible |
| 202 | # relative links, before moving on. (This internal variable should |
| 203 | # never be modified externally.) |
| 204 | max_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), |
| 205 | |
| 206 | # NOTE: Don't delete me unless you delete max_relative_links_to_visit |
| 207 | # An integer, representing the maximum number of relative links that |
| 208 | # the browser should visit, before moving onto another website. If |
| 209 | # negative, then the browser will exhaust all possible relative links |
| 210 | # found, before moving on. This functionality is best effort; it's |
| 211 | # possible for the browser to visit new links on previously visited |
| 212 | # websites. |
| 213 | _remaining_number_of_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), |
| 214 | |
| 215 | #COPY DARIEN'S COMMENTS! YAY! (or not, boo!)) |
| 216 | _next_connections => {}, |
| 217 | |
| 218 | #This is here so that it doesn't have to do a getVar every time it wants to run |
| 219 | # the LWP stuff. Can maybe be moved into the big datastructure. |
| 220 | http_proxy => getVar(name => "http_proxy"), |
| 221 | |
| 222 | ); |
| 223 | |
| 224 | ############################################################################### |
| 225 | # Private Methods Implemented # |
| 226 | ############################################################################### |
| 227 | |
| 228 | # Helper function designed to retrieve the next link for the browser |
| 229 | # to navigate to. |
| 230 | # |
| 231 | # Note: Calling this function will implicitly remove the next link from |
| 232 | # any and all applicable hashtables/scalars. |
| 233 | # |
| 234 | # When getting the next link, 'next_link_to_visit' is checked first. |
| 235 | # If that value is undef, then the 'relative_links_to_visit' hashtable |
| 236 | # is checked next. If that hashtable is empty, then finally the |
| 237 | # 'links_to_visit' hashtable is checked. |
| 238 | # |
| 239 | # Inputs: HoneyClient::Agent::Driver::FF object |
| 240 | # Outputs: link, or undef if all applicable scalars/hashtables are empty |
| 241 | sub _getNextLink { |
| 242 | |
| 243 | # Get the object state. |
| 244 | my $self = shift; |
| 245 | |
| 246 | # Set the link to find as undef, initially. |
| 247 | my $link = undef; |
| 248 | |
| 249 | while ( !defined($link) or ($link eq "") ) { |
| 250 | |
| 251 | # Try getting the next link from the next link |
| 252 | # scalar. |
| 253 | $link = $self->next_link_to_visit; |
| 254 | $self->{next_link_to_visit} = undef; |
| 255 | |
| 256 | # If the next link scalar is empty, try |
| 257 | # getting a link from the relative hashtable. |
| 258 | unless ( defined($link) ) { |
| 259 | $link = _pop( $self->relative_links_to_visit ); |
| 260 | } |
| 261 | |
| 262 | # If the relative hashtable is empty, try getting one |
| 263 | # from the external hashtable. |
| 264 | unless ( defined($link) ) { |
| 265 | $link = _pop( $self->links_to_visit ); |
| 266 | } |
| 267 | |
| 268 | # If all hashtables/scalars were empty, immediately return an |
| 269 | # undef value. |
| 270 | unless ( defined($link) ) { |
| 271 | return $link; |
| 272 | } |
| 273 | |
| 274 | # Now, make sure the link is valid, before we return |
| 275 | # it; if it's not valid, we simply move on to the next |
| 276 | # one in our hashtables. |
| 277 | #$link = $self->_validateLink($link); |
| 278 | } |
| 279 | |
| 280 | # Return the next link found. |
| 281 | return $link; |
| 282 | } |
| 283 | ############################################################################### |
| 284 | |
| 285 | # Helper function designed to get a current timestamp from |
| 286 | # the system OS. |
| 287 | # |
| 288 | # Note: This timestamp is in ISO 8601 format. |
| 289 | # |
| 290 | # Inputs: none |
| 291 | # Outputs: timestamp |
| 292 | sub _getTimestamp { |
| 293 | my $dt = DateTime::HiRes->now(); |
| 294 | return $dt . "." . $dt->nanosecond(); |
| 295 | } |
| 296 | ############################################################################### |
| 297 | # Helper function designed to "pop" a key off a given hashtable. |
| 298 | # When given a hashtable reference, this function will extract a valid key |
| 299 | # from the hashtable and delete the (key, value) pair from the |
| 300 | # hashtable. |
| 301 | # |
| 302 | # Note: There is no guaranteed order about how this function picks |
| 303 | # keys from the hashtable. |
| 304 | # |
| 305 | # Inputs: hashref |
| 306 | # Outputs: valid key, or undef if the hash is empty |
| 307 | sub _pop { |
| 308 | |
| 309 | # Get supplied hash reference. |
| 310 | my $hash = shift; |
| 311 | |
| 312 | # Get a new key. |
| 313 | my @keys = keys( %{$hash} ); |
| 314 | my $key = pop(@keys); |
| 315 | |
| 316 | # Delete the key from the hashtable. |
| 317 | if ( defined($key) ) { |
| 318 | delete $hash->{$key}; |
| 319 | } |
| 320 | ## print "in pop " . Dumper($hash); |
| 321 | |
| 322 | # Return the key found. |
| 323 | return $key; |
| 324 | } |
| 325 | |
| 326 | ############################################################################### |
| 327 | |
| 328 | # Helper function, designed to extract the hostname |
| 329 | # (and, if it exists, the port number) from a given |
| 330 | # URL. |
| 331 | # |
| 332 | # For example, if "http://hostname.com:80/path/index.html" |
| 333 | # is given, then "hostname:80" would be returned. |
| 334 | # |
| 335 | # Inputs: URL |
| 336 | # Outputs: hostname[:port] |
| 337 | sub _extractHostname { |
| 338 | |
| 339 | # Sanity check. |
| 340 | my $arg = shift(); |
| 341 | |
| 342 | if (!defined($arg)) { |
| 343 | return ""; |
| 344 | } |
| 345 | |
| 346 | # Get the URL supplied. |
| 347 | my $url = $arg; |
| 348 | |
| 349 | # Note: The '?' chars make a critical difference |
| 350 | # in how this regex operates. |
| 351 | $url =~ s/.*?\/\/(.*?)\/.*/$1/; |
| 352 | |
| 353 | # Return the extracted hostname. |
| 354 | return $url; |
| 355 | } |
| 356 | ############################################################################### |
| 357 | |
| 358 | # Helper function, designed to process all links found at a |
| 359 | # given URL, once the browser has been driven to that URL |
| 360 | # and has collected all corresponding links. |
| 361 | # |
| 362 | # When supplied with the array of URLs |
| 363 | # this function will categorize them |
| 364 | # as follows: |
| 365 | # |
| 366 | # "New" links are those we've never driven the browser to. |
| 367 | # "Old" links are those we've driven the browser to before. |
| 368 | # |
| 369 | # - If a link is new and "invalid", then it gets added to |
| 370 | # the 'links_ignored' hashtable. |
| 371 | # |
| 372 | # - If a link is old and "invalid", then it gets |
| 373 | # ignored. |
| 374 | # |
| 375 | # - If a link is old and "valid", then it gets ignored. |
| 376 | # |
| 377 | # - If a link is new and "valid", then we check to see if |
| 378 | # the referring URL's hostname[:port] and the link's |
| 379 | # hostname[:port] match. If they match, then the link |
| 380 | # is added to the 'relative_links_to_visit' hash. |
| 381 | # Otherwise, the link is added to the 'links_to_visit' |
| 382 | # hash. |
| 383 | # |
| 384 | # Inputs: HoneyClient::Agent::Driver::FF object, |
| 385 | # referring URL, |
| 386 | # array of LinkExtor link arrays |
| 387 | # Outputs: HoneyClient::Agent::Driver::FF object |
| 388 | |
| 389 | sub _processLinks { |
| 390 | |
| 391 | # Get the object state. |
| 392 | my $self = shift; |
| 393 | |
| 394 | # Get the referrer and the corresponding array of links. |
| 395 | my ( $base_url, @links ) = @_; |
| 396 | |
| 397 | # Sanity check: Make sure we've been fed an object. |
| 398 | unless ( ref($self) ) { |
| 399 | Carp::croak "Error: Function must be called in reference to a " |
| 400 | . __PACKAGE__ |
| 401 | . "->new() object!\n"; |
| 402 | } |
| 403 | |
| 404 | foreach my $link (@links) { |
| 405 | |
| 406 | #format of this array (pointed to by the $link ref) is |
| 407 | # like ("img", "src", "$url") or ("a", "href", "$url") |
| 408 | |
| 409 | #skip some link types for now |
| 410 | if ( @{$link}[0] eq "form" |
| 411 | || @{$link}[0] eq "input" |
| 412 | || @{$link}[0] eq "script" |
| 413 | || @{$link}[0] eq "td" |
| 414 | || @{$link}[0] eq "table" ) |
| 415 | { |
| 416 | next; |
| 417 | } |
| 418 | |
| 419 | #Need to put these in a bucket so the firewall knows to allow access to them |
| 420 | if ( @{$link}[0] eq "img" |
| 421 | || @{$link}[0] eq "object" |
| 422 | || @{$link}[0] eq "embed" ) |
| 423 | { |
| 424 | |
| 425 | # lazy way, just absolutize right away incased it's a relative URL |
| 426 | # this should catch any time a base href is actually used |
| 427 | my $tmp_hn = |
| 428 | _extractHostname( URI->new_abs( "@{$link}[2]", "$base_url" ) ); |
| 429 | ## print "@{$link}[0] @{$link}[2] $tmp_hn\n"; |
| 430 | my $tmp_port; |
| 431 | |
| 432 | #if it already has a port appended use that, otherwise set it to the default |
| 433 | if ( $tmp_hn =~ /.*?:(.*)/ ) { |
| 434 | $tmp_port = $1; |
| 435 | } |
| 436 | else { |
| 437 | |
| 438 | #Can you have image links to other protocols? |
| 439 | #if(@{$link}[2] =~ /^ftp/) {$tmp_port = 21;} |
| 440 | if ( @{$link}[2] =~ /^http/ ) { $tmp_port = 80; } |
| 441 | if ( @{$link}[2] =~ /^https/ ) { $tmp_port = 443; } |
| 442 | else { $tmp_port = 80; } |
| 443 | } |
| 444 | if(!exists($self->{hosts_to_resolve}{$tmp_hn})){ |
| 445 | $self->{hosts_to_resolve}{$tmp_hn} = $tmp_port; |
| 446 | print "adding $tmp_hn to hosts_to_resolve bucket in _processLinks\n"; |
| 447 | } |
| 448 | |
| 449 | next; |
| 450 | |
| 451 | } |
| 452 | |
| 453 | if ( @{$link}[0] ne "a" |
| 454 | && @{$link}[0] ne "link" |
| 455 | && @{$link}[0] ne "area" |
| 456 | && @{$link}[0] ne "iframe" ) |
| 457 | { |
| 458 | print "new type of link!: @{$link}\n"; |
| 459 | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
| 460 | next; |
| 461 | } |
| 462 | |
| 463 | #strip anchors |
| 464 | @{$link}[2] =~ s/\#.*//; |
| 465 | |
| 466 | #trying to avoid links already in the buckets |
| 467 | if ( $self->_preexisting( @{$link}[2] ) |
| 468 | || @{$link}[2] eq $self->next_link_to_visit |
| 469 | || "@{$link}[2]/" eq $self->next_link_to_visit ) |
| 470 | { |
| 471 | next; |
| 472 | } |
| 473 | |
| 474 | # print "@{$link}\n"; |
| 475 | |
| 476 | #starts with http(s):// |
| 477 | # Check to make sure it's not an absolute link to a site we're already on |
| 478 | # The second case of the if() is "the slashdot exception" ;) |
| 479 | if ( @{$link}[2] =~ /^http[s]?:\/\//i || @{$link}[2] =~ /^\/\//i ) { |
| 480 | my $tmp_url; |
| 481 | if ( _extractHostname( @{$link}[2] ) eq |
| 482 | _extractHostname( $self->{next_link_to_visit} ) ) |
| 483 | { |
| 484 | # URI->new_abs() will never return undef; however, |
| 485 | # it could die if invalid parameters are supplied. |
| 486 | $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); |
| 487 | $self->relative_links_to_visit->{$tmp_url} = 1; |
| 488 | next; |
| 489 | } |
| 490 | else { |
| 491 | # URI->new_abs() will never return undef; however, |
| 492 | # it could die if invalid parameters are supplied. |
| 493 | $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); |
| 494 | $self->links_to_visit->{$tmp_url} = 1; |
| 495 | next; |
| 496 | } |
| 497 | } |
| 498 | |
| 499 | #ignore mailto and javascript links |
| 500 | if ( @{$link}[2] =~ /^mailto/ |
| 501 | || @{$link}[2] =~ /;$/ |
| 502 | || @{$link}[2] =~ /^javascript:/ ) |
| 503 | { |
| 504 | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
| 505 | next; |
| 506 | } |
| 507 | |
| 508 | #ignore any other URI types for now since we've already got http(s) links |
| 509 | if ( @{$link}[2] =~ /^\w*:\/\// ) { |
| 510 | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
| 511 | next; |
| 512 | } |
| 513 | |
| 514 | #From here on out it should be a relative link, so first we absolutize it |
| 515 | # URI->new_abs() will never return undef; however, |
| 516 | # it could die if invalid parameters are supplied. |
| 517 | my $uri = URI->new_abs( "@{$link}[2]", "$base_url" ); |
| 518 | |
| 519 | # Then we check to make sure the absolute link isn't a variant of the page |
| 520 | # we got it from, and not already visited |
| 521 | if ( $uri ne $base_url |
| 522 | && $uri ne "$base_url/" |
| 523 | && $uri ne $self->next_link_to_visit |
| 524 | && "$uri/" ne $self->next_link_to_visit ) |
| 525 | { |
| 526 | |
| 527 | if ( !$self->_preexisting($uri) ) { |
| 528 | |
| 529 | # print "2 adding (formerly) relative link $uri to the relative links to visit\n"; |
| 530 | $self->relative_links_to_visit->{$uri} = 1; |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | } |
| 535 | |
| 536 | # Return the modified object state. |
| 537 | return $self; |
| 538 | } |
| 539 | |
| 540 | ############################################################################### |
| 541 | |
| 542 | # returns 1 if a URI exists somewhere in the buckets |
| 543 | # otherwise returns 0 |
| 544 | |
| 545 | # All the commented out code is related to whether we should test for the |
| 546 | # existence of a link that is the same as the current link except possibly |
| 547 | # having or missing a trailing slash. |
| 548 | # Currently that code is not used, and therefore it will treat |
| 549 | # http://www.foo.com/bar |
| 550 | # as a different link than |
| 551 | # http://www.foo.com/bar/ |
| 552 | # And therefore go to both link independantly |
| 553 | # Comments on whether this should be the case or not are greatly appreciated |
| 554 | |
| 555 | sub _preexisting { |
| 556 | my $self = shift; |
| 557 | my $uri = shift; |
| 558 | |
| 559 | # if($uri !~ /\/$/){ |
| 560 | #try as is only (currently...and with trailing slash maybe later) |
| 561 | if ( |
| 562 | exists( $self->links_to_visit->{$uri} ) |
| 563 | || |
| 564 | |
| 565 | # $self->links_to_visit->{"$uri/"} || |
| 566 | exists( $self->relative_links_to_visit->{$uri} ) || |
| 567 | |
| 568 | # $self->relative_links_to_visit->{"$uri/"} || |
| 569 | exists( $self->links_ignored->{$uri} ) || |
| 570 | |
| 571 | # $self->links_ignored->{"$uri/"} || |
| 572 | exists( $self->links_visited->{$uri} ) |
| 573 | ) |
| 574 | { |
| 575 | |
| 576 | # $self->links_visited->{"$uri/"} ){ |
| 577 | return 1; |
| 578 | } |
| 579 | |
| 580 | # } |
| 581 | # else{ |
| 582 | #try without a trailing slash |
| 583 | # $uri =~ s/\/$//; |
| 584 | # if( $self->links_to_visit->{$uri} || |
| 585 | # $self->relative_links_to_visit->{$uri} || |
| 586 | # $self->links_ignored->{$uri} || |
| 587 | # $self->links_visited->{$uri} ){ |
| 588 | # return 1; |
| 589 | # } |
| 590 | # } |
| 591 | |
| 592 | return 0; |
| 593 | } |
| 594 | |
| 595 | ############################################################################### |
| 596 | # Public Methods Implemented # |
| 597 | ############################################################################### |
| 598 | |
| 599 | =pod |
| 600 | |
| 601 | =head1 METHODS IMPLEMENTED |
| 602 | |
| 603 | The following functions have been implemented by the FF driver. Many |
| 604 | of these methods were implementations of the parent Driver interface. |
| 605 | |
| 606 | As such, the following code descriptions pertain to this particular |
| 607 | Driver implementation. For further information about the generic |
| 608 | Driver interface, see the L<HoneyClient::Agent::Driver> documentation. |
| 609 | |
| 610 | =head2 HoneyClient::Agent::Driver::FF->new($param => $value, ...) |
| 611 | |
| 612 | =over 4 |
| 613 | |
| 614 | Creates a new FF driver object, which contains a hashtable |
| 615 | containing any of the supplied "param => value" arguments. |
| 616 | |
| 617 | I<Inputs>: |
| 618 | B<$param> is an optional parameter variable. |
| 619 | B<$value> is $param's corresponding value. |
| 620 | |
| 621 | Note: If any $param(s) are supplied, then an equal number of |
| 622 | corresponding $value(s) B<must> also be specified. |
| 623 | |
| 624 | I<Output>: The instantiated FF driver B<$object>, fully initialized. |
| 625 | |
| 626 | =back |
| 627 | |
| 628 | =begin testing |
| 629 | |
| 630 | # XXX: Add this. |
| 631 | 1; |
| 632 | |
| 633 | =end testing |
| 634 | |
| 635 | =cut |
| 636 | |
| 637 | sub new { |
| 638 | |
| 639 | # - This function takes in an optional hashtable, |
| 640 | # that contains various key => 'value' configuration |
| 641 | # parameters. |
| 642 | # |
| 643 | # - For each parameter given, it overwrites any corresponding |
| 644 | # parameters specified within the default hashtable, %PARAMS, |
| 645 | # with custom entries that were given as parameters. |
| 646 | # |
| 647 | # - Finally, it returns a blessed instance of the |
| 648 | # merged hashtable, as an 'object'. |
| 649 | |
| 650 | # Get the class name. |
| 651 | my $self = shift; |
| 652 | |
| 653 | # Get the rest of the arguments, as a hashtable. |
| 654 | # Hash-based arguments are used, since HoneyClient::Util::SOAP is unable to handle |
| 655 | # hash references directly. Thus, flat hashtables are used throughout the code |
| 656 | # for consistency. |
| 657 | my %args = @_; |
| 658 | |
| 659 | # Check to see if the class name is inherited or defined. |
| 660 | my $class = ref($self) || $self; |
| 661 | |
| 662 | # Initialize default parameters. |
| 663 | my %params = %{ dclone( \%PARAMS ) }; |
| 664 | $self = $class->SUPER::new(); |
| 665 | @{$self}{ keys %params } = values %params; |
| 666 | |
| 667 | # Now, overwrite any default parameters that were redefined |
| 668 | # in the supplied arguments. |
| 669 | @{$self}{ keys %args } = values %args; |
| 670 | |
| 671 | # Now, assign our object the appropriate namespace. |
| 672 | bless $self, $class; |
| 673 | |
| 674 | #bla |
| 675 | $self->{_remaining_number_of_relative_links_to_visit} = |
| 676 | $self->{max_relative_links_to_visit}; |
| 677 | |
| 678 | # Finally, return the blessed object. |
| 679 | return $self; |
| 680 | } |
| 681 | |
| 682 | ############################################################################### |
| 683 | |
| 684 | =pod |
| 685 | |
| 686 | =head2 $object->drive() |
| 687 | |
| 688 | =over 4 |
| 689 | |
| 690 | Drives an instance of Firefox for one iteration, |
| 691 | navigating to the next URL and updating the driver's corresponding |
| 692 | internal hashtables accordingly. |
| 693 | |
| 694 | For a description of which hashtable is consulted upon each |
| 695 | iteration of drive(), see the L<next_link_to_visit> documentation, in |
| 696 | the "DEFAULT PARAMETER LIST" section. |
| 697 | |
| 698 | Once a drive() iteration has completed, the corresponding Firefox browser |
| 699 | process is terminated. Thus, each call to drive() invokes a new instance of |
| 700 | the browser. |
| 701 | |
| 702 | I<Output>: The updated FF driver B<$object>, containing state information |
| 703 | from driving Firefox for one iteration. |
| 704 | |
| 705 | B<Warning>: This method will B<croak> if the FF driver object is B<unable> |
| 706 | to navigate to a new link, because its list of links to visit is empty. |
| 707 | |
| 708 | =back |
| 709 | |
| 710 | =begin testing |
| 711 | |
| 712 | # XXX: Test this. |
| 713 | 1; |
| 714 | |
| 715 | =end testing |
| 716 | |
| 717 | =cut |
| 718 | |
| 719 | sub drive { |
| 720 | |
| 721 | # Get the object state. |
| 722 | my $self = shift; |
| 723 | my $base_url = undef; |
| 724 | |
| 725 | # my $drive_ff = 1; # To be set to one for some corner cases where |
| 726 | # we don't want the real browser to visit a site |
| 727 | |
| 728 | # Sanity check: Make sure we've been fed an object. |
| 729 | unless ( ref($self) ) { |
| 730 | Carp::croak "Error: Function must be called in reference to a " |
| 731 | . __PACKAGE__ |
| 732 | . "->new() object!\n"; |
| 733 | } |
| 734 | |
| 735 | # Get the next URL from our hashtables. |
| 736 | my $url = $self->_dep_getNextLink(); |
| 737 | |
| 738 | # Sanity check: Make sure our next URL is defined. |
| 739 | unless ( defined($url) ) { |
| 740 | Carp::croak "Error: Unable to drive browser - 'links_to_visit' " |
| 741 | . "hashtable is empty!\n"; |
| 742 | } |
| 743 | |
| 744 | ########################### |
| 745 | #Drive the LWP-based "browser" |
| 746 | ########################### |
| 747 | if ( $self->{flip_flop} == 0 ) { |
| 748 | print "LWP browsing to $url\n"; |
| 749 | $self->{drive_ff} = 1; |
| 750 | my $ua = LWP::UserAgent->new( "keep_alive" => 300, max_redirect => 0, |
| 751 | timeout => $self->{timeout}); |
| 752 | if($self->{http_proxy} ne 'none'){ |
| 753 | $ua->proxy( 'http', $self->{http_proxy}); |
| 754 | } |
| 755 | |
| 756 | #Set up the headers to mimic FF |
| 757 | #TODO: LWP::UserAgent headers are not *exactly* the same as Firefox still |
| 758 | # this is because if I turn off the TE header manually in the .pm file |
| 759 | # and if I set the 'Accept-Encoding' to 'gzip,deflate' (which is what is |
| 760 | # in the TE header too) it still doesn't process compressed info correctly |
| 761 | $ua->agent( |
| 762 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.5' |
| 763 | ); |
| 764 | $ua->default_header( 'Accept' => |
| 765 | 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5' |
| 766 | ); |
| 767 | $ua->default_header( 'Accept-Language' => 'en-us,en;q=0.5' ); |
| 768 | $ua->default_header( |
| 769 | 'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7' ); |
| 770 | |
| 771 | # $ua->default_header('Accept-Encoding' => 'gzip,deflate'); |
| 772 | |
| 773 | # get() returns an HTTP::Response object |
| 774 | # and the raw html will be in $data->content |
| 775 | my $data = $ua->get($url); |
| 776 | if ( !defined($data) ) { |
| 777 | print "got back a bad HTTP::Response object. Dying\n"; |
| 778 | $self->{next_link_to_visit} = undef; |
| 779 | exit; |
| 780 | } |
| 781 | |
| 782 | if ( $data->is_redirect ) { |
| 783 | print "GOT REDIRECTED trying to go to $url\n"; |
| 784 | $self->links_visited->{$url} = _getTimestamp() . ":http_refresh"; |
| 785 | $self->{drive_ff} = 0; |
| 786 | } |
| 787 | |
| 788 | if ( $data->content() =~ /^[0-9][0-9][0-9]/ ) { |
| 789 | |
| 790 | # In practice I have only seen an error message starting with 500 |
| 791 | # to get to this case.(when the socket connect fails or your ctrl-c) |
| 792 | # - Xeno |
| 793 | my $tmp = $data->content(); |
| 794 | chomp($tmp); |
| 795 | print "Error: $tmp. ABORTING\n"; |
| 796 | $self->{next_link_to_visit} = undef; |
| 797 | return -1; |
| 798 | } |
| 799 | |
| 800 | #Before we give it to LinkExtor, check for any meta refresh URLs |
| 801 | my $p = HTML::HeadParser->new(); |
| 802 | # TODO: Check to make sure the parser is functioning properly. |
| 803 | $p->parse( $data->content() ); |
| 804 | |
| 805 | #The HeadParser doesn't get us all the way there, still need regex |
| 806 | ### print "HEADER\n" . Dumper($p->header) . "\n"; |
| 807 | if ( defined( $p->header->{refresh} ) |
| 808 | && $p->header->{refresh} =~ /.*URL=(.*)/i ) |
| 809 | { |
| 810 | print "META REFRESH going to $1\n"; |
| 811 | |
| 812 | #Just pretend we went there |
| 813 | #Also absolutize just incase (since I've seen it needed) |
| 814 | # URI->new_abs() will never return undef; however, |
| 815 | # it could die if invalid parameters are supplied. |
| 816 | my $uri = URI->new_abs($1, $url); |
| 817 | |
| 818 | $self->links_to_visit->{$uri} = _getTimestamp(); |
| 819 | $self->links_visited->{$url} = _getTimestamp() . ":meta_refresh"; |
| 820 | |
| 821 | # TODO: Should *really* only not go if the refresh timer is less than |
| 822 | # our timeout...Currently it ignores the time alltogether |
