| 105 | | my %PARAMS = ( |
|---|
| 106 | | |
|---|
| 107 | | # This is a hashtable of fully qualified URLs |
|---|
| 108 | | # to visit by the browser. Specifically, the 'key' is |
|---|
| 109 | | # the absolute URL and the 'value' is always 1. |
|---|
| 110 | | links_to_visit => {}, |
|---|
| 111 | | |
|---|
| 112 | | # This is a hashtable of fully qualified URLs that the |
|---|
| 113 | | # browser has already visited. Specifically, the |
|---|
| 114 | | # 'key' is the absolute URL and the 'value' is a string |
|---|
| 115 | | # representing the date and time of when the link was visited. |
|---|
| 116 | | # |
|---|
| 117 | | # Note: See _getTimestamp() for the corresponding date/time |
|---|
| 118 | | # format. |
|---|
| 119 | | links_visited => {}, |
|---|
| 120 | | |
|---|
| 121 | | # This is a hashtable of URLs that the browser has found |
|---|
| 122 | | # during its traversal process, but the browser could not |
|---|
| 123 | | # access the link. |
|---|
| 124 | | # |
|---|
| 125 | | # Links could be added to this list if access requires any type of |
|---|
| 126 | | # authentication, or if the link points to a non-HTTP or HTTPS |
|---|
| 127 | | # resource (i.e., "javascript:doNetDetect()"). |
|---|
| 128 | | # |
|---|
| 129 | | # The 'key' is the absolute URL and the 'value' is a string |
|---|
| 130 | | # representing the date and time of when the link was visited. |
|---|
| 131 | | # |
|---|
| 132 | | # Note: See _getTimestamp() for the corresponding date/time |
|---|
| 133 | | # format. |
|---|
| 134 | | links_ignored => {}, |
|---|
| 135 | | |
|---|
| 136 | | # This is a hashtable of fully qualified URLs |
|---|
| 137 | | # that all share a common *hostname*. This hashtable should be |
|---|
| 138 | | # initially empty. As the driver extracts and removes new URLs |
|---|
| 139 | | # off the 'links_to_visit' hashtable, driving the browser to each URL, |
|---|
| 140 | | # any *relative* links found are added into this hashtable; any |
|---|
| 141 | | # *external* links found are added back into the 'links_to_visit' |
|---|
| 142 | | # hashtable. |
|---|
| 143 | | # |
|---|
| 144 | | # When navigating to the next link, this hashtable is exhausted prior |
|---|
| 145 | | # to the main 'links_to_visit' hashtable. This allows a |
|---|
| 146 | | # browser to navigate to all links hosted on the same server, prior |
|---|
| 147 | | # to contacting a different server. |
|---|
| 148 | | # |
|---|
| 149 | | # Specifically, the 'key' is the absolute URL and the 'value' |
|---|
| 150 | | # is always 1. |
|---|
| 151 | | relative_links_to_visit => {}, |
|---|
| 152 | | |
|---|
| 153 | | # This is a scalar that contains the next URL to visit. |
|---|
| 154 | | # It is updated dynamically, any time _dep_getNextLink() is called. |
|---|
| 155 | | # When the browser is ready to drive to the next link, |
|---|
| 156 | | # 'next_link_to_visit' is checked. If that value is undef, then |
|---|
| 157 | | # the 'relative_links_to_visit' hashtable is checked next. |
|---|
| 158 | | # If that hashtable is empty, then finally the 'links_to_visit' |
|---|
| 159 | | # hashtable is checked. |
|---|
| 160 | | next_link_to_visit => undef, |
|---|
| 161 | | |
|---|
| 162 | | # This is a hashtable of URLs that the browser has found |
|---|
| 163 | | # during its traversal process, but the browser could not |
|---|
| 164 | | # access the resource due to the operation timing out. |
|---|
| 165 | | # |
|---|
| 166 | | # The 'key' is the absolute URL and the 'value' is a string |
|---|
| 167 | | # representing the date and time of when the link was visited. |
|---|
| 168 | | # |
|---|
| 169 | | # Note: See _getTimestamp() for the corresponding date/time |
|---|
| 170 | | # format. |
|---|
| 171 | | links_timed_out => {}, |
|---|
| 172 | | |
|---|
| 173 | | # Absolute path to firefox executable. Needs to be in Windows |
|---|
| 174 | | # path format not Cygwin, because it's used in Win32::Job |
|---|
| 175 | | ff_exec => getVar(name => "ff_exec"), |
|---|
| 176 | | |
|---|
| 177 | | # Hashtable used to hold hostnames and ports for any other sites which may |
|---|
| 178 | | # need to be looked up (for instance external image URLs) |
|---|
| 179 | | # Entries are in the form |
|---|
| 180 | | # {hostname => port, ...} |
|---|
| 181 | | hosts_to_resolve => {}, |
|---|
| 182 | | |
|---|
| 183 | | # A switch which can be turned off (set to 0) to decide NOT to drive ff |
|---|
| 184 | | # to a particularly instance of a URL. For instance because the URL in |
|---|
| 185 | | # question actually redirects somewhere else and you don't want to get |
|---|
| 186 | | # a false positive on the firewall. |
|---|
| 187 | | drive_ff => 0, |
|---|
| 188 | | |
|---|
| 189 | | # Because the FF.pm drive() function actually uses the perl LWP::UserAgent |
|---|
| 190 | | # to actually process a page and find links, there are actually two things |
|---|
| 191 | | # going on inside this function. Because of the interaction between next() |
|---|
| 192 | | # and drive() it is therefore necessary to pop back out of the drive() |
|---|
| 193 | | # after it's got the links but before it drives ff, so that the data can |
|---|
| 194 | | # be used by next() and given back to the firewall before it drives ff. |
|---|
| 195 | | # This toggle lets every other instance of drive() either use the |
|---|
| 196 | | # LWP::UserAgent or Firefox. |
|---|
| 197 | | flip_flop => 0, |
|---|
| 198 | | |
|---|
| 199 | | # An integer, representing how many relative links the browser |
|---|
| 200 | | # should continue to drive to, before moving onto another |
|---|
| 201 | | # website. If negative, then the browser will exhaust all possible |
|---|
| 202 | | # relative links, before moving on. (This internal variable should |
|---|
| 203 | | # never be modified externally.) |
|---|
| 204 | | max_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), |
|---|
| 205 | | |
|---|
| 206 | | # NOTE: Don't delete me unless you delete max_relative_links_to_visit |
|---|
| 207 | | # An integer, representing the maximum number of relative links that |
|---|
| 208 | | # the browser should visit, before moving onto another website. If |
|---|
| 209 | | # negative, then the browser will exhaust all possible relative links |
|---|
| 210 | | # found, before moving on. This functionality is best effort; it's |
|---|
| 211 | | # possible for the browser to visit new links on previously visited |
|---|
| 212 | | # websites. |
|---|
| 213 | | _remaining_number_of_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), |
|---|
| 214 | | |
|---|
| 215 | | #COPY DARIEN'S COMMENTS! YAY! (or not, boo!)) |
|---|
| 216 | | _next_connections => {}, |
|---|
| 217 | | |
|---|
| 218 | | #This is here so that it doesn't have to do a getVar every time it wants to run |
|---|
| 219 | | # the LWP stuff. Can maybe be moved into the big datastructure. |
|---|
| 220 | | http_proxy => getVar(name => "http_proxy"), |
|---|
| 221 | | |
|---|
| 222 | | ); |
|---|
| 223 | | |
|---|
| 224 | | ############################################################################### |
|---|
| 225 | | # Private Methods Implemented # |
|---|
| 226 | | ############################################################################### |
|---|
| 227 | | |
|---|
| 228 | | # Helper function designed to retrieve the next link for the browser |
|---|
| 229 | | # to navigate to. |
|---|
| 230 | | # |
|---|
| 231 | | # Note: Calling this function will implicitly remove the next link from |
|---|
| 232 | | # any and all applicable hashtables/scalars. |
|---|
| 233 | | # |
|---|
| 234 | | # When getting the next link, 'next_link_to_visit' is checked first. |
|---|
| 235 | | # If that value is undef, then the 'relative_links_to_visit' hashtable |
|---|
| 236 | | # is checked next. If that hashtable is empty, then finally the |
|---|
| 237 | | # 'links_to_visit' hashtable is checked. |
|---|
| 238 | | # |
|---|
| 239 | | # Inputs: HoneyClient::Agent::Driver::FF object |
|---|
| 240 | | # Outputs: link, or undef if all applicable scalars/hashtables are empty |
|---|
| 241 | | sub _getNextLink { |
|---|
| 242 | | |
|---|
| 243 | | # Get the object state. |
|---|
| 244 | | my $self = shift; |
|---|
| 245 | | |
|---|
| 246 | | # Set the link to find as undef, initially. |
|---|
| 247 | | my $link = undef; |
|---|
| 248 | | |
|---|
| 249 | | while ( !defined($link) or ($link eq "") ) { |
|---|
| 250 | | |
|---|
| 251 | | # Try getting the next link from the next link |
|---|
| 252 | | # scalar. |
|---|
| 253 | | $link = $self->next_link_to_visit; |
|---|
| 254 | | $self->{next_link_to_visit} = undef; |
|---|
| 255 | | |
|---|
| 256 | | # If the next link scalar is empty, try |
|---|
| 257 | | # getting a link from the relative hashtable. |
|---|
| 258 | | unless ( defined($link) ) { |
|---|
| 259 | | $link = _pop( $self->relative_links_to_visit ); |
|---|
| 260 | | } |
|---|
| 261 | | |
|---|
| 262 | | # If the relative hashtable is empty, try getting one |
|---|
| 263 | | # from the external hashtable. |
|---|
| 264 | | unless ( defined($link) ) { |
|---|
| 265 | | $link = _pop( $self->links_to_visit ); |
|---|
| 266 | | } |
|---|
| 267 | | |
|---|
| 268 | | # If all hashtables/scalars were empty, immediately return an |
|---|
| 269 | | # undef value. |
|---|
| 270 | | unless ( defined($link) ) { |
|---|
| 271 | | return $link; |
|---|
| 272 | | } |
|---|
| 273 | | |
|---|
| 274 | | # Now, make sure the link is valid, before we return |
|---|
| 275 | | # it; if it's not valid, we simply move on to the next |
|---|
| 276 | | # one in our hashtables. |
|---|
| 277 | | #$link = $self->_validateLink($link); |
|---|
| 278 | | } |
|---|
| 279 | | |
|---|
| 280 | | # Return the next link found. |
|---|
| 281 | | return $link; |
|---|
| | 121 | # Use threads Library |
|---|
| | 122 | # TODO: Need unit testing. |
|---|
| | 123 | use threads; |
|---|
| | 124 | |
|---|
| | 125 | # TODO: Need unit testing. |
|---|
| | 126 | use threads::shared; |
|---|
| | 127 | |
|---|
| | 128 | # TODO: Need unit testing. |
|---|
| | 129 | use HoneyClient::Util::SOAP qw(getClientHandle); |
|---|
| | 130 | |
|---|
| | 131 | # TODO: Need unit testing. |
|---|
| | 132 | use Win32::Job; |
|---|
| | 133 | |
|---|
| | 134 | # TODO: clean this up. |
|---|
| | 135 | #my %PARAMS = ( |
|---|
| | 136 | #); |
|---|
| | 137 | |
|---|
| | 138 | ####################################################################### |
|---|
| | 139 | # Private Methods Implemented # |
|---|
| | 140 | ####################################################################### |
|---|
| | 141 | |
|---|
| | 142 | #sub new { |
|---|
| | 143 | # |
|---|
| | 144 | # # - This function takes in an optional hashtable, |
|---|
| | 145 | # # that contains various key => 'value' configuration |
|---|
| | 146 | # # parameters. |
|---|
| | 147 | # # |
|---|
| | 148 | # # - For each parameter given, it overwrites any corresponding |
|---|
| | 149 | # # parameters specified within the default hashtable, %PARAMS, |
|---|
| | 150 | # # with custom entries that were given as parameters. |
|---|
| | 151 | # # |
|---|
| | 152 | # # - Finally, it returns a blessed instance of the |
|---|
| | 153 | # # merged hashtable, as an 'object'. |
|---|
| | 154 | # |
|---|
| | 155 | # # Get the class name. |
|---|
| | 156 | # my $self = shift; |
|---|
| | 157 | # |
|---|
| | 158 | # # Get the rest of the arguments, as a hashtable. |
|---|
| | 159 | # # Hash-based arguments are used, since HoneyClient::Util::SOAP is unable to handle |
|---|
| | 160 | # # hash references directly. Thus, flat hashtables are used throughout the code |
|---|
| | 161 | # # for consistency. |
|---|
| | 162 | # my %args = @_; |
|---|
| | 163 | # |
|---|
| | 164 | # # Check to see if the class name is inherited or defined. |
|---|
| | 165 | # my $class = ref($self) || $self; |
|---|
| | 166 | # |
|---|
| | 167 | # # Initialize default parameters. |
|---|
| | 168 | # my %params = %{dclone(\%PARAMS)}; |
|---|
| | 169 | # $self = $class->SUPER::new(); |
|---|
| | 170 | # @{$self}{keys %params} = values %params; |
|---|
| | 171 | # |
|---|
| | 172 | # # Now, overwrite any default parameters that were redefined |
|---|
| | 173 | # # in the supplied arguments. |
|---|
| | 174 | # @{$self}{keys %args} = values %args; |
|---|
| | 175 | # |
|---|
| | 176 | # # Now, assign our object the appropriate namespace. |
|---|
| | 177 | # bless $self, $class; |
|---|
| | 178 | # |
|---|
| | 179 | # # Finally, return the blessed object. |
|---|
| | 180 | # return $self; |
|---|
| | 181 | #} |
|---|
| | 182 | |
|---|
| | 183 | sub drive { |
|---|
| | 184 | |
|---|
| | 185 | # Extract arguments. |
|---|
| | 186 | my ($self, %args) = @_; |
|---|
| | 187 | |
|---|
| | 188 | # Sanity check: Make sure we've been fed an object. |
|---|
| | 189 | unless (ref($self)) { |
|---|
| | 190 | Carp::croak "Error: Function must be called in reference to a " . |
|---|
| | 191 | __PACKAGE__ . "->new() object!\n"; |
|---|
| | 192 | } |
|---|
| | 193 | |
|---|
| | 194 | # Sanity check, don't get the next link, if |
|---|
| | 195 | # we've been fed a url. |
|---|
| | 196 | my $argsExist = scalar(%args); |
|---|
| | 197 | if (!$argsExist || |
|---|
| | 198 | !exists($args{'url'}) || |
|---|
| | 199 | !defined($args{'url'})) { |
|---|
| | 200 | # Get the next URL from our hashtables. |
|---|
| | 201 | $args{'url'} = $self->_getNextLink(); |
|---|
| | 202 | } |
|---|
| | 203 | |
|---|
| | 204 | # Drive the generic browser before opening with IE |
|---|
| | 205 | $self = $self->SUPER::drive(%args); |
|---|
| | 206 | |
|---|
| | 207 | # Sanity check: Make sure our next URL is defined. |
|---|
| | 208 | unless (defined($args{'url'})) { |
|---|
| | 209 | Carp::croak "Error: Unable to drive browser - 'links_to_visit' " . |
|---|
| | 210 | "hashtable is empty!\n"; |
|---|
| | 211 | } |
|---|
| | 212 | |
|---|
| | 213 | # Indicates how long we wait for each drive operation to complete, |
|---|
| | 214 | # before registering attempt as a failure. |
|---|
| | 215 | my $timeout : shared = $self->timeout(); |
|---|
| | 216 | |
|---|
| | 217 | # Create a new Job. |
|---|
| | 218 | my $job = Win32::Job->new(); |
|---|
| | 219 | |
|---|
| | 220 | # Sanity check. |
|---|
| | 221 | if (!defined($job)) { |
|---|
| | 222 | Carp::croak "Error: Unable to spawn new job - " . $^E . ".\n"; |
|---|
| | 223 | } |
|---|
| | 224 | |
|---|
| | 225 | # Spawn the job. |
|---|
| | 226 | $job->spawn(undef, "\"C:\\Program Files\\Mozilla Firefox\\firefox.exe\"" . $args{'url'}); |
|---|
| | 227 | |
|---|
| | 228 | # TODO: check to see if spawn fails. |
|---|
| | 229 | |
|---|
| | 230 | # Run the job. |
|---|
| | 231 | $job->run($timeout); |
|---|
| | 232 | |
|---|
| | 233 | # TODO: check to see if run fails. |
|---|
| | 234 | |
|---|
| | 235 | # Return the modified object state. |
|---|
| | 236 | return $self; |
|---|
| 283 | | ############################################################################### |
|---|
| 284 | | |
|---|
| 285 | | # Helper function designed to get a current timestamp from |
|---|
| 286 | | # the system OS. |
|---|
| 287 | | # |
|---|
| 288 | | # Note: This timestamp is in ISO 8601 format. |
|---|
| 289 | | # |
|---|
| 290 | | # Inputs: none |
|---|
| 291 | | # Outputs: timestamp |
|---|
| 292 | | sub _getTimestamp { |
|---|
| 293 | | my $dt = DateTime::HiRes->now(); |
|---|
| 294 | | return $dt . "." . $dt->nanosecond(); |
|---|
| 295 | | } |
|---|
| 296 | | ############################################################################### |
|---|
| 297 | | # Helper function designed to "pop" a key off a given hashtable. |
|---|
| 298 | | # When given a hashtable reference, this function will extract a valid key |
|---|
| 299 | | # from the hashtable and delete the (key, value) pair from the |
|---|
| 300 | | # hashtable. |
|---|
| 301 | | # |
|---|
| 302 | | # Note: There is no guaranteed order about how this function picks |
|---|
| 303 | | # keys from the hashtable. |
|---|
| 304 | | # |
|---|
| 305 | | # Inputs: hashref |
|---|
| 306 | | # Outputs: valid key, or undef if the hash is empty |
|---|
| 307 | | sub _pop { |
|---|
| 308 | | |
|---|
| 309 | | # Get supplied hash reference. |
|---|
| 310 | | my $hash = shift; |
|---|
| 311 | | |
|---|
| 312 | | # Get a new key. |
|---|
| 313 | | my @keys = keys( %{$hash} ); |
|---|
| 314 | | my $key = pop(@keys); |
|---|
| 315 | | |
|---|
| 316 | | # Delete the key from the hashtable. |
|---|
| 317 | | if ( defined($key) ) { |
|---|
| 318 | | delete $hash->{$key}; |
|---|
| 319 | | } |
|---|
| 320 | | ## print "in pop " . Dumper($hash); |
|---|
| 321 | | |
|---|
| 322 | | # Return the key found. |
|---|
| 323 | | return $key; |
|---|
| 324 | | } |
|---|
| 325 | | |
|---|
| 326 | | ############################################################################### |
|---|
| 327 | | |
|---|
| 328 | | # Helper function, designed to extract the hostname |
|---|
| 329 | | # (and, if it exists, the port number) from a given |
|---|
| 330 | | # URL. |
|---|
| 331 | | # |
|---|
| 332 | | # For example, if "http://hostname.com:80/path/index.html" |
|---|
| 333 | | # is given, then "hostname:80" would be returned. |
|---|
| 334 | | # |
|---|
| 335 | | # Inputs: URL |
|---|
| 336 | | # Outputs: hostname[:port] |
|---|
| 337 | | sub _extractHostname { |
|---|
| 338 | | |
|---|
| 339 | | # Sanity check. |
|---|
| 340 | | my $arg = shift(); |
|---|
| 341 | | |
|---|
| 342 | | if (!defined($arg)) { |
|---|
| 343 | | return ""; |
|---|
| 344 | | } |
|---|
| 345 | | |
|---|
| 346 | | # Get the URL supplied. |
|---|
| 347 | | my $url = $arg; |
|---|
| 348 | | |
|---|
| 349 | | # Note: The '?' chars make a critical difference |
|---|
| 350 | | # in how this regex operates. |
|---|
| 351 | | $url =~ s/.*?\/\/(.*?)\/.*/$1/; |
|---|
| 352 | | |
|---|
| 353 | | # Return the extracted hostname. |
|---|
| 354 | | return $url; |
|---|
| 355 | | } |
|---|
| 356 | | ############################################################################### |
|---|
| 357 | | |
|---|
| 358 | | # Helper function, designed to process all links found at a |
|---|
| 359 | | # given URL, once the browser has been driven to that URL |
|---|
| 360 | | # and has collected all corresponding links. |
|---|
| 361 | | # |
|---|
| 362 | | # When supplied with the array of URLs |
|---|
| 363 | | # this function will categorize them |
|---|
| 364 | | # as follows: |
|---|
| 365 | | # |
|---|
| 366 | | # "New" links are those we've never driven the browser to. |
|---|
| 367 | | # "Old" links are those we've driven the browser to before. |
|---|
| 368 | | # |
|---|
| 369 | | # - If a link is new and "invalid", then it gets added to |
|---|
| 370 | | # the 'links_ignored' hashtable. |
|---|
| 371 | | # |
|---|
| 372 | | # - If a link is old and "invalid", then it gets |
|---|
| 373 | | # ignored. |
|---|
| 374 | | # |
|---|
| 375 | | # - If a link is old and "valid", then it gets ignored. |
|---|
| 376 | | # |
|---|
| 377 | | # - If a link is new and "valid", then we check to see if |
|---|
| 378 | | # the referring URL's hostname[:port] and the link's |
|---|
| 379 | | # hostname[:port] match. If they match, then the link |
|---|
| 380 | | # is added to the 'relative_links_to_visit' hash. |
|---|
| 381 | | # Otherwise, the link is added to the 'links_to_visit' |
|---|
| 382 | | # hash. |
|---|
| 383 | | # |
|---|
| 384 | | # Inputs: HoneyClient::Agent::Driver::FF object, |
|---|
| 385 | | # referring URL, |
|---|
| 386 | | # array of LinkExtor link arrays |
|---|
| 387 | | # Outputs: HoneyClient::Agent::Driver::FF object |
|---|
| 388 | | |
|---|
| 389 | | sub _processLinks { |
|---|
| 390 | | |
|---|
| 391 | | # Get the object state. |
|---|
| 392 | | my $self = shift; |
|---|
| 393 | | |
|---|
| 394 | | # Get the referrer and the corresponding array of links. |
|---|
| 395 | | my ( $base_url, @links ) = @_; |
|---|
| 396 | | |
|---|
| 397 | | # Sanity check: Make sure we've been fed an object. |
|---|
| 398 | | unless ( ref($self) ) { |
|---|
| 399 | | Carp::croak "Error: Function must be called in reference to a " |
|---|
| 400 | | . __PACKAGE__ |
|---|
| 401 | | . "->new() object!\n"; |
|---|
| 402 | | } |
|---|
| 403 | | |
|---|
| 404 | | foreach my $link (@links) { |
|---|
| 405 | | |
|---|
| 406 | | #format of this array (pointed to by the $link ref) is |
|---|
| 407 | | # like ("img", "src", "$url") or ("a", "href", "$url") |
|---|
| 408 | | |
|---|
| 409 | | #skip some link types for now |
|---|
| 410 | | if ( @{$link}[0] eq "form" |
|---|
| 411 | | || @{$link}[0] eq "input" |
|---|
| 412 | | || @{$link}[0] eq "script" |
|---|
| 413 | | || @{$link}[0] eq "td" |
|---|
| 414 | | || @{$link}[0] eq "table" ) |
|---|
| 415 | | { |
|---|
| 416 | | next; |
|---|
| 417 | | } |
|---|
| 418 | | |
|---|
| 419 | | #Need to put these in a bucket so the firewall knows to allow access to them |
|---|
| 420 | | if ( @{$link}[0] eq "img" |
|---|
| 421 | | || @{$link}[0] eq "object" |
|---|
| 422 | | || @{$link}[0] eq "embed" ) |
|---|
| 423 | | { |
|---|
| 424 | | |
|---|
| 425 | | # lazy way, just absolutize right away incased it's a relative URL |
|---|
| 426 | | # this should catch any time a base href is actually used |
|---|
| 427 | | my $tmp_hn = |
|---|
| 428 | | _extractHostname( URI->new_abs( "@{$link}[2]", "$base_url" ) ); |
|---|
| 429 | | ## print "@{$link}[0] @{$link}[2] $tmp_hn\n"; |
|---|
| 430 | | my $tmp_port; |
|---|
| 431 | | |
|---|
| 432 | | #if it already has a port appended use that, otherwise set it to the default |
|---|
| 433 | | if ( $tmp_hn =~ /.*?:(.*)/ ) { |
|---|
| 434 | | $tmp_port = $1; |
|---|
| 435 | | } |
|---|
| 436 | | else { |
|---|
| 437 | | |
|---|
| 438 | | #Can you have image links to other protocols? |
|---|
| 439 | | #if(@{$link}[2] =~ /^ftp/) {$tmp_port = 21;} |
|---|
| 440 | | if ( @{$link}[2] =~ /^http/ ) { $tmp_port = 80; } |
|---|
| 441 | | if ( @{$link}[2] =~ /^https/ ) { $tmp_port = 443; } |
|---|
| 442 | | else { $tmp_port = 80; } |
|---|
| 443 | | } |
|---|
| 444 | | if(!exists($self->{hosts_to_resolve}{$tmp_hn})){ |
|---|
| 445 | | $self->{hosts_to_resolve}{$tmp_hn} = $tmp_port; |
|---|
| 446 | | print "adding $tmp_hn to hosts_to_resolve bucket in _processLinks\n"; |
|---|
| 447 | | } |
|---|
| 448 | | |
|---|
| 449 | | next; |
|---|
| 450 | | |
|---|
| 451 | | } |
|---|
| 452 | | |
|---|
| 453 | | if ( @{$link}[0] ne "a" |
|---|
| 454 | | && @{$link}[0] ne "link" |
|---|
| 455 | | && @{$link}[0] ne "area" |
|---|
| 456 | | && @{$link}[0] ne "iframe" ) |
|---|
| 457 | | { |
|---|
| 458 | | print "new type of link!: @{$link}\n"; |
|---|
| 459 | | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
|---|
| 460 | | next; |
|---|
| 461 | | } |
|---|
| 462 | | |
|---|
| 463 | | #strip anchors |
|---|
| 464 | | @{$link}[2] =~ s/\#.*//; |
|---|
| 465 | | |
|---|
| 466 | | #trying to avoid links already in the buckets |
|---|
| 467 | | if ( $self->_preexisting( @{$link}[2] ) |
|---|
| 468 | | || @{$link}[2] eq $self->next_link_to_visit |
|---|
| 469 | | || "@{$link}[2]/" eq $self->next_link_to_visit ) |
|---|
| 470 | | { |
|---|
| 471 | | next; |
|---|
| 472 | | } |
|---|
| 473 | | |
|---|
| 474 | | # print "@{$link}\n"; |
|---|
| 475 | | |
|---|
| 476 | | #starts with http(s):// |
|---|
| 477 | | # Check to make sure it's not an absolute link to a site we're already on |
|---|
| 478 | | # The second case of the if() is "the slashdot exception" ;) |
|---|
| 479 | | if ( @{$link}[2] =~ /^http[s]?:\/\//i || @{$link}[2] =~ /^\/\//i ) { |
|---|
| 480 | | my $tmp_url; |
|---|
| 481 | | if ( _extractHostname( @{$link}[2] ) eq |
|---|
| 482 | | _extractHostname( $self->{next_link_to_visit} ) ) |
|---|
| 483 | | { |
|---|
| 484 | | # URI->new_abs() will never return undef; however, |
|---|
| 485 | | # it could die if invalid parameters are supplied. |
|---|
| 486 | | $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); |
|---|
| 487 | | $self->relative_links_to_visit->{$tmp_url} = 1; |
|---|
| 488 | | next; |
|---|
| 489 | | } |
|---|
| 490 | | else { |
|---|
| 491 | | # URI->new_abs() will never return undef; however, |
|---|
| 492 | | # it could die if invalid parameters are supplied. |
|---|
| 493 | | $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); |
|---|
| 494 | | $self->links_to_visit->{$tmp_url} = 1; |
|---|
| 495 | | next; |
|---|
| 496 | | } |
|---|
| 497 | | } |
|---|
| 498 | | |
|---|
| 499 | | #ignore mailto and javascript links |
|---|
| 500 | | if ( @{$link}[2] =~ /^mailto/ |
|---|
| 501 | | || @{$link}[2] =~ /;$/ |
|---|
| 502 | | || @{$link}[2] =~ /^javascript:/ ) |
|---|
| 503 | | { |
|---|
| 504 | | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
|---|
| 505 | | next; |
|---|
| 506 | | } |
|---|
| 507 | | |
|---|
| 508 | | #ignore any other URI types for now since we've already got http(s) links |
|---|
| 509 | | if ( @{$link}[2] =~ /^\w*:\/\// ) { |
|---|
| 510 | | $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); |
|---|
| 511 | | next; |
|---|
| 512 | | } |
|---|
| 513 | | |
|---|
| 514 | | #From here on out it should be a relative link, so first we absolutize it |
|---|
| 515 | | # URI->new_abs() will never return undef; however, |
|---|
| 516 | | # it could die if invalid parameters are supplied. |
|---|
| 517 | | my $uri = URI->new_abs( "@{$link}[2]", "$base_url" ); |
|---|
| 518 | | |
|---|
| 519 | | # Then we check to make sure the absolute link isn't a variant of the page |
|---|
| 520 | | # we got it from, and not already visited |
|---|
| 521 | | if ( $uri ne $base_url |
|---|
| 522 | | && $uri ne "$base_url/" |
|---|
| 523 | | && $uri ne $self->next_link_to_visit |
|---|
| 524 | | && "$uri/" ne $self->next_link_to_visit ) |
|---|
| 525 | | { |
|---|
| 526 | | |
|---|
| 527 | | if ( !$self->_preexisting($uri) ) { |
|---|
| 528 | | |
|---|
| 529 | | # print "2 adding (formerly) relative link $uri to the relative links to visit\n"; |
|---|
| 530 | | $self->relative_links_to_visit->{$uri} = 1; |
|---|
| 531 | | } |
|---|
| 532 | | } |
|---|
| 533 | | |
|---|
| 534 | | } |
|---|
| 535 | | |
|---|
| 536 | | # Return the modified object state. |
|---|
| 537 | | return $self; |
|---|
| 538 | | } |
|---|
| 539 | | |
|---|
| 540 | | ############################################################################### |
|---|
| 541 | | |
|---|
| 542 | | # returns 1 if a URI exists somewhere in the buckets |
|---|
| 543 | | # otherwise returns 0 |
|---|
| 544 | | |
|---|
| 545 | | # All the commented out code is related to whether we should test for the |
|---|
| 546 | | # existence of a link that is the same as the current link except possibly |
|---|
| 547 | | # having or missing a trailing slash. |
|---|
| 548 | | # Currently that code is not used, and therefore it will treat |
|---|
| 549 | | # http://www.foo.com/bar |
|---|
| 550 | | # as a different link than |
|---|
| 551 | | # http://www.foo.com/bar/ |
|---|
| 552 | | # And therefore go to both link independantly |
|---|
| 553 | | # Comments on whether this should be the case or not are greatly appreciated |
|---|
| 554 | | |
|---|
| 555 | | sub _preexisting { |
|---|
| 556 | | my $self = shift; |
|---|
| 557 | | my $uri = shift; |
|---|
| 558 | | |
|---|
| 559 | | # if($uri !~ /\/$/){ |
|---|
| 560 | | #try as is only (currently...and with trailing slash maybe later) |
|---|
| 561 | | if ( |
|---|
| 562 | | exists( $self->links_to_visit->{$uri} ) |
|---|
| 563 | | || |
|---|
| 564 | | |
|---|
| 565 | | # $self->links_to_visit->{"$uri/"} || |
|---|
| 566 | | exists( $self->relative_links_to_visit->{$uri} ) || |
|---|
| 567 | | |
|---|
| 568 | | # $self->relative_links_to_visit->{"$uri/"} || |
|---|
| 569 | | exists( $self->links_ignored->{$uri} ) || |
|---|
| 570 | | |
|---|
| 571 | | # $self->links_ignored->{"$uri/"} || |
|---|
| 572 | | exists( $self->links_visited->{$uri} ) |
|---|
| 573 | | ) |
|---|
| 574 | | { |
|---|
| 575 | | |
|---|
| 576 | | # $self->links_visited->{"$uri/"} ){ |
|---|
| 577 | | return 1; |
|---|
| 578 | | } |
|---|
| 579 | | |
|---|
| 580 | | # } |
|---|
| 581 | | # else{ |
|---|
| 582 | | #try without a trailing slash |
|---|
| 583 | | # $uri =~ s/\/$//; |
|---|
| 584 | | # if( $self->links_to_visit->{$uri} || |
|---|
| 585 | | # $self->relative_links_to_visit->{$uri} || |
|---|
| 586 | | # $self->links_ignored->{$uri} || |
|---|
| 587 | | # $self->links_visited->{$uri} ){ |
|---|
| 588 | | # return 1; |
|---|
| 589 | | # } |
|---|
| 590 | | # } |
|---|
| 591 | | |
|---|
| 592 | | return 0; |
|---|
| 593 | | } |
|---|
| 594 | | |
|---|
| 595 | | ############################################################################### |
|---|
| 596 | | # Public Methods Implemented # |
|---|
| 597 | | ############################################################################### |
|---|
| 598 | | |
|---|
| 599 | | =pod |
|---|
| 600 | | |
|---|
| 601 | | =head1 METHODS IMPLEMENTED |
|---|
| 602 | | |
|---|
| 603 | | The following functions have been implemented by the FF driver. Many |
|---|
| 604 | | of these methods were implementations of the parent Driver interface. |
|---|
| 605 | | |
|---|
| 606 | | As such, the following code descriptions pertain to this particular |
|---|
| 607 | | Driver implementation. For further information about the generic |
|---|
| 608 | | Driver interface, see the L<HoneyClient::Agent::Driver> documentation. |
|---|
| 609 | | |
|---|
| 610 | | =head2 HoneyClient::Agent::Driver::FF->new($param => $value, ...) |
|---|
| 611 | | |
|---|
| 612 | | =over 4 |
|---|
| 613 | | |
|---|
| 614 | | Creates a new FF driver object, which contains a hashtable |
|---|
| 615 | | containing any of the supplied "param => value" arguments. |
|---|
| 616 | | |
|---|
| 617 | | I<Inputs>: |
|---|
| 618 | | B<$param> is an optional parameter variable. |
|---|
| 619 | | B<$value> is $param's corresponding value. |
|---|
| 620 | | |
|---|
| 621 | | Note: If any $param(s) are supplied, then an equal number of |
|---|
| 622 | | corresponding $value(s) B<must> also be specified. |
|---|
| 623 | | |
|---|
| 624 | | I<Output>: The instantiated FF driver B<$object>, fully initialized. |
|---|
| 625 | | |
|---|
| 626 | | =back |
|---|
| 627 | | |
|---|
| 628 | | =begin testing |
|---|
| 629 | | |
|---|
| 630 | | # XXX: Add this. |
|---|
| | 238 | |
|---|
| | 239 | ####################################################################### |
|---|
| | 240 | |
|---|