Changeset 41

Show
Ignore:
Timestamp:
11/29/06 11:18:10 (2 years ago)
Author:
kindlund
Message:

added initial link scoring changes, from internal SVN repository

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • honeyclient/branches/exp/stephenson-link_scoring/lib/HoneyClient/Agent/Driver/Browser.pm

    r13 r41  
    561561# When given a hashtable reference, this function will extract a valid key 
    562562# from the hashtable and delete the (key, value) pair from the  
    563 # hashtable. 
    564 
    565 # Note: There is no guaranteed order about how this function picks 
    566 # keys from the hashtable. 
     563# hashtable.  The link with the highest score is returned. 
     564
     565#  
    567566# 
    568567# Inputs: hashref 
     
    573572    my $hash = shift; 
    574573 
    575     # Get a new key
    576     my @keys = keys(%{$hash})
    577     my $key = pop(@keys)
     574    # Get the highest score
     575    my @array = sort {$$hash{$b} <=> $$hash{$a}} keys %{$hash}
     576    my $topkey = $array[0]
    578577     
    579578    # Delete the key from the hashtable. 
    580     if (defined($key)) { 
    581         delete $hash->{$key}; 
     579    if (defined($topkey)) { 
     580        delete $hash->{$topkey}; 
    582581    } 
    583582 
    584583    # Return the key found. 
    585     return $key; 
    586 
    587  
    588 # This is the abstract function which actually fetches the web content using 
    589 # a specific browser implementation.  Must be implemented by each browser class. 
    590  
    591 sub getContent { 
    592  
    593 
    594  
    595 # Helper function which parses the HTTP::Response from LWP::UserAgent 
    596 # and returns an array of the links contained in the response 
    597 
    598 # Inputs: HTTP::Response object 
    599 # Outputs: Array containing all href links within the response 
    600  
    601 sub _getAllLinks { 
    602      
    603     my $response = shift; 
    604     my $hostname = shift; 
    605     my @links = (); 
    606     my $thislink; 
    607      
    608     my $html = $response->content; 
    609      
    610     while( $html =~ m/<A HREF=\"(.*?)\"/gi ) { 
    611         $thislink = $1; 
    612  
    613         # For relative links, prepend the hostname 
    614         # TODO:  Probably shouldn't assume the HTTP protocol... 
    615         if ($thislink =~ /^\//) { 
    616             $thislink = "http://" . $hostname . $thislink; 
    617         } 
    618          
    619         push @links, $thislink; 
    620     } 
    621  
    622     #Return the list of absolute links 
    623     return @links; 
     584    return $topkey; 
    624585} 
    625586 
     
    655616# Helper function, designed to process all links found at a 
    656617# given URL, once the browser has been driven to that URL 
    657 # and has collected all corresponding links. 
     618# and has collected all corresponding links.  The links are 
     619# sorted in increasing order as determined by their score. 
    658620# 
    659621# When supplied with the array of URL strings, 
     
    682644#   hash. 
    683645# 
    684 # Inputs: HoneyClient::Agent::Driver::IE object, 
     646# Inputs: HoneyClient::Agent::Driver::Browser object, 
    685647#         hostname[:port] of referring URL, 
    686 #         array of URL strings 
    687 # Outputs: HoneyClient::Agent::Driver::IE object 
     648#         hash of URL strings and scores, the url is the key 
     649# Outputs: HoneyClient::Agent::Driver::Browser object 
    688650sub _processLinks { 
    689651 
     
    691653    my $self = shift; 
    692654 
    693     # Get the referrer and the corresponding array of links. 
    694     my ($referrer, @links) = @_; 
     655    # Get the referrer and the corresponding arrays of links and scores. 
     656    my ($referrer, %links) = @_; 
    695657     
    696     foreach my $url (@links) { 
     658    foreach my $url (keys %links) { 
     659        my $score = $links{$url}; 
    697660 
    698661        # Skip over any undefined links. 
     
    722685            # Then add the URL to the 'relative_links_to_visit' hashtable, 
    723686            # since we're visiting links that share the same hostname. 
    724             $self->relative_links_to_visit->{$url} = 1
     687            $self->relative_links_to_visit->{$url} = $score
    725688        } else { 
    726689            # Else, add the URL to the 'links_to_visit' hashtable, 
    727690            # since we're visiting links that do NOT share the same hostname. 
    728             $self->links_to_visit->{$url} = 1
     691            $self->links_to_visit->{$url} = $score
    729692        } 
    730693    } 
    731      
     694         
    732695    # Return the modified object state. 
    733696    return $self; 
     
    986949     
    987950    # Use LWP::UserAgent to get the desired $args{'url'} and associated content 
    988     my @links = undef;  
    989  
    990951    # TODO: Analyze all the options LWP::UserAgent provides, in case we've  
    991952    # missed something useful. 
     
    1006967    $ua->max_size(1*1024*1024); # Don't get values larger than 1MB for testing 
    1007968    $ua->timeout($timeout); 
    1008  
    1009     # XXX: This is old code; delete eventually. 
    1010 #   my $response = $ua->get($args{'url'}); 
    1011  
    1012     # Get the links 
    1013 #    @links = _getAllLinks($response, _extractHostname($args{'url'})); 
    1014  
    1015     # Make the parser.  Unfortunately, we don't know the base yet 
    1016     # (it might be diffent from $url) 
    1017     #my $parser = HTML::LinkExtor->new(\&extractLinks); 
    1018     my $parser = HTML::LinkExtor->new(); 
    1019  
     969     
    1020970    my $response = $ua->request( 
    1021971                        HTTP::Request->new( 
     
    1025975                                'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', 
    1026976                            ), 
    1027                         ), 
    1028                         sub { $parser->parse($_[0]) }, 
     977                        ) 
    1029978    ); 
    1030      
    1031     # Extract only the <a href ...> links, for now. 
    1032     # TODO: Handle other link types. 
    1033     foreach my $entry ($parser->links) { 
    1034         if ($entry->[0] eq 'a') { 
    1035             push(@links, $entry->[2]); 
    1036         } 
    1037     } 
    1038  
    1039     # Expand all relative links found to absolute ones. 
     979 
     980    # Get the base url from the response 
    1040981    my $base = $response->base; 
    1041     @links = map { $_ = url($_, $base)->abs; } @links; 
    1042  
     982    my $content = $response->content; 
     983    my %scored_links; 
     984     
    1043985    # Get the current time. 
    1044986    my $timestamp = _getTimestamp(); 
    1045  
     987     
     988    # Score the new links based on their surrounding HTML context 
     989    # If %scored_links is emtpy upon return, there are no links 
     990    # and we will not perform any of the following code 
     991    if ($content) { 
     992        %scored_links = _scoreLinks($base, $content); 
     993    } 
     994     
    1046995    # Check to see if the request timed out. 
    1047996    # TODO: Need better error detection. 
    1048     if (!@links) { 
     997    if (!%scored_links) { 
    1049998        $self->links_timed_out->{$args{'url'}} = $timestamp; 
    1050999 
     
    10591008        $self->links_visited->{$args{'url'}} = $timestamp; 
    10601009 
    1061         # Get all links found on this page
     1010        # Add all links found on this page to our sorted queues
    10621011        # This function modifies the $self object internally and its 
    10631012        # returned content does not need to be checked. 
    1064         $self->_processLinks(_extractHostname($args{'url'}), @links); 
     1013        $self->_processLinks(_extractHostname($args{'url'}), %scored_links); 
    10651014    } 
    10661015 
     
    12681217 
    12691218    return $nextSite; 
     1219} 
     1220 
     1221=pod 
     1222 
     1223=head2 _scoreLinks() 
     1224 
     1225=over 4 
     1226 
     1227The _scoreLinks helper function takes a scalar which is the base url for 
     1228the web page and a scalar which holds the content of the page (HTML) 
     1229 
     1230This function will calculate the "popularity" scores of the links. 
     1231The function returns a hash which is keyed on the _absolute_ url 
     1232and contains the value of the score. 
     1233 
     1234I<Output>: The populated %links hash if the page is not empty. An empty 
     1235hash otherwise. 
     1236 
     1237For example, if your  raw HTML content is $content, and the base url is 
     1238$base you would use the following call to this function. 
     1239 
     1240my %links = _scoreLinks($base, $content); 
     1241if (!%links) { 
     1242    # Do error checking here 
     1243  } 
     1244 
     1245=back 
     1246 
     1247=begin testing 
     1248 
     1249# XXX: Test this. 
     12501; 
     1251 
     1252=end testing 
     1253 
     1254=cut 
     1255 
     1256sub _scoreLinks { 
     1257    my ($base, $content) = @_; 
     1258    my %links = (); 
     1259    my $url; 
     1260    open(FILEH,">>scoring.txt") || die("Cannot Open File"); 
     1261     
     1262    if (!$content) { 
     1263        return %links; 
     1264    } 
     1265     
     1266    # Begin to scour the HTML content for <a> tags   
     1267    while ($content =~ m{<a\b([^>]+)>(.*?)</a>}ig) { 
     1268        my $attr = $1; 
     1269        my $text = $2; 
     1270        my $score = 0; 
     1271     
     1272        if ($attr =~ m{ 
     1273                        \b HREF 
     1274                        \s* = \s* 
     1275                        (?: 
     1276                          "([^"]*)" 
     1277                          | 
     1278                          '([^']*)' 
     1279                          | 
     1280                          {[^'">\s]+} 
     1281                        ) 
     1282                     }xi) 
     1283         { 
     1284            $url = $+; 
     1285             
     1286            # We have to make this an absolute url (if it's not) 
     1287            # before using it as a key in the %links hash 
     1288            $url = url($url, $base)->abs; 
     1289             
     1290            # The link must be an HREF and be a http(s) link     
     1291            if ($url =~ /^http/i) { 
     1292                # Image bonus 
     1293                if ($text =~ /img/i) { 
     1294                    $score += 50; 
     1295                    print FILEH "Image bonus!\n";  
     1296                } 
     1297                # Score image size 
     1298                if ($text =~ /\b WIDTH\s*=\s*.(\d+)/xi) 
     1299                { 
     1300                    my $width = $1; 
     1301                    $score += int($width/10); 
     1302                    print FILEH "Image area bonus! $width\n";  
     1303                } 
     1304                if ($text =~ /\b HEIGHT\s*=\s*.(\d+)/xi) 
     1305                { 
     1306                    my $height = $1; 
     1307                    $score += int($height/10); 
     1308                    print FILEH "Image area bonus! $height\n";  
     1309                } 
     1310                # Score length of link text 
     1311                if (length($text) > 15) { 
     1312                    $score += 4; 
     1313                    print FILEH "Long link text bonus!\n"; 
     1314                } 
     1315                # Good word bonus 
     1316                if ($text =~ /(news|new|big|latest|main|update|sell|free|buy)/i) { 
     1317                    $score += 6; 
     1318                    print FILEH "Good word bonus!\n"; 
     1319                } 
     1320                # Bad word penalty 
     1321                if ($text =~ /(archive|privacy|legal|disclaim|about|contact|copyright|jobs|careers)/i) { 
     1322                    $score -= 6; 
     1323                    print FILEH "Bad word penalty!\n"; 
     1324                } 
     1325     
     1326                print FILEH "The attributes for $url are $attr\n" unless (!$attr); 
     1327                print FILEH "The text for $url is $text\n" unless (!$text); 
     1328                print FILEH "It scored $score\n"; 
     1329                 
     1330                $links{$url} = $score; 
     1331                $url = undef; 
     1332                print FILEH "\n"; 
     1333            } 
     1334        } 
     1335    } 
     1336     
     1337    close(FILEH);        
     1338    return %links; 
    12701339} 
    12711340 
  • honeyclient/branches/exp/stephenson-link_scoring/lib/HoneyClient/Agent/Driver/Browser/FF.pm

    r13 r41  
    3434use strict; 
    3535use warnings; 
     36use Config; 
    3637use Carp (); 
    37 use Config; 
    38 use Win32::Job;          #For starting browser 
    39 use HTML::LinkExtor;     #For extracting links from HTML 
    40 use HTML::HeadParser;    #For extracting the meta w/ URL that LinkExtor misses 
    41 use LWP::UserAgent;      #Perl-based "browser" 
    42 use URI;                 #For absolutizing relative URLs 
    43 #use Data::Dumper;       #For Debugging 
    4438 
    4539# Traps signals, allowing END: blocks to perform cleanup. 
    4640use sigtrap qw(die untrapped normal-signals error-signals); 
    4741 
    48 ############################################################################### 
    49 # Module Initialization                                                      
    50 ############################################################################### 
     42####################################################################### 
     43# Module Initialization                                               
     44####################################################################### 
    5145 
    5246BEGIN { 
     
    6054 
    6155    # Define inherited modules. 
    62     use HoneyClient::Agent::Driver
    63  
    64     @ISA = qw(Exporter HoneyClient::Agent::Driver); 
     56    use HoneyClient::Agent::Driver::Browser
     57 
     58    @ISA = qw(Exporter HoneyClient::Agent::Driver::Browser); 
    6559 
    6660    # Symbols to export on request 
     
    7569    # Do not simply export all your public functions/methods/constants. 
    7670 
    77     # This allows declaration use HoneyClient::Agent::Driver::FF ':all'; 
     71    # This allows declaration use HoneyClient::Agent::Driver::Browser::IE ':all'; 
    7872    # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK 
    7973    # will save memory. 
     
    8882    @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); 
    8983 
     84# XXX: Fix this! 
     85# Check to make sure our OS is Windows-based. 
     86#if ($Config{osname} !~ /^MSWin32$/) { 
     87#    Carp::croak "Error: " . __PACKAGE__ . " will only run on Win32 platforms!\n"; 
     88#} 
     89 
    9090    $SIG{PIPE} = 'IGNORE';    # Do not exit on broken pipes. 
    9191} 
    9292our ( @EXPORT_OK, $VERSION ); 
    9393 
    94 ############################################################################### 
     94#TODO: Rewrite the test module 
     95 
     96=pod 
     97 
     98=begin testing 
     99 
     100=end testing 
     101 
     102=cut 
     103 
     104####################################################################### 
     105 
     106#TODO: Remove any of these use statements that aren't needed 
    95107 
    96108# Include the Global Configuration Processing Library 
     
    100112use DateTime::HiRes; 
    101113 
     114# Use fractional second sleeping. 
     115# TODO: Need unit testing. 
     116use Time::HiRes qw(sleep); 
     117 
    102118# Use Storable Library 
    103119use Storable qw(dclone); 
    104120 
    105 my %PARAMS = ( 
    106  
    107     # This is a hashtable of fully qualified URLs 
    108     # to visit by the browser.  Specifically, the 'key' is 
    109     # the absolute URL and the 'value' is always 1. 
    110     links_to_visit => {}, 
    111  
    112     # This is a hashtable of fully qualified URLs that the 
    113     # browser has already visited.  Specifically, the 
    114     # 'key' is the absolute URL and the 'value' is a string 
    115     # representing the date and time of when the link was visited. 
    116     # 
    117     # Note: See _getTimestamp() for the corresponding date/time 
    118     # format. 
    119     links_visited => {}, 
    120  
    121     # This is a hashtable of URLs that the browser has found 
    122     # during its traversal process, but the browser could not 
    123     # access the link. 
    124     # 
    125     # Links could be added to this list if access requires any type of 
    126     # authentication, or if the link points to a non-HTTP or HTTPS 
    127     # resource (i.e., "javascript:doNetDetect()"). 
    128     # 
    129     # The 'key' is the absolute URL and the 'value' is a string 
    130     # representing the date and time of when the link was visited. 
    131     # 
    132     # Note: See _getTimestamp() for the corresponding date/time 
    133     # format. 
    134     links_ignored => {}, 
    135  
    136     # This is a hashtable of fully qualified URLs 
    137     # that all share a common *hostname*.  This hashtable should be 
    138     # initially empty.  As the driver extracts and removes new URLs 
    139     # off the 'links_to_visit' hashtable, driving the browser to each URL, 
    140     # any *relative* links found are added into this hashtable; any 
    141     # *external* links found are added back into the 'links_to_visit' 
    142     # hashtable. 
    143     # 
    144     # When navigating to the next link, this hashtable is exhausted prior 
    145     # to the main 'links_to_visit' hashtable.  This allows a 
    146     # browser to navigate to all links hosted on the same server, prior 
    147     # to contacting a different server. 
    148     # 
    149     # Specifically, the 'key' is the absolute URL and the 'value' 
    150     # is always 1. 
    151     relative_links_to_visit => {}, 
    152  
    153     # This is a scalar that contains the next URL to visit. 
    154     # It is updated dynamically, any time _dep_getNextLink() is called. 
    155     # When the browser is ready to drive to the next link, 
    156     # 'next_link_to_visit' is checked.  If that value is undef, then 
    157     # the 'relative_links_to_visit' hashtable is checked next. 
    158     # If that hashtable is empty, then finally the 'links_to_visit' 
    159     # hashtable is checked. 
    160     next_link_to_visit => undef, 
    161  
    162     # This is a hashtable of URLs that the browser has found 
    163     # during its traversal process, but the browser could not 
    164     # access the resource due to the operation timing out. 
    165     # 
    166     # The 'key' is the absolute URL and the 'value' is a string 
    167     # representing the date and time of when the link was visited. 
    168     # 
    169     # Note: See _getTimestamp() for the corresponding date/time 
    170     # format. 
    171     links_timed_out => {}, 
    172  
    173     # Absolute path to firefox executable. Needs to be in Windows 
    174     # path format not Cygwin, because it's used in Win32::Job 
    175     ff_exec => getVar(name => "ff_exec"), 
    176  
    177     # Hashtable used to hold hostnames and ports for any other sites which may 
    178     # need to be looked up (for instance external image URLs) 
    179     # Entries are in the form 
    180     # {hostname => port, ...} 
    181     hosts_to_resolve => {}, 
    182  
    183     # A switch which can be turned off (set to 0) to decide NOT to drive ff 
    184     # to a particularly instance of a URL. For instance because the URL in 
    185     # question actually redirects somewhere else and you don't want to get 
    186     # a false positive on the firewall. 
    187     drive_ff => 0, 
    188  
    189     # Because the FF.pm drive() function actually uses the perl LWP::UserAgent 
    190     # to actually process a page and find links, there are actually two things 
    191     # going on inside this function. Because of the interaction between next() 
    192     # and drive() it is therefore necessary to pop back out of the drive() 
    193     # after it's got the links but before it drives ff, so that the data can 
    194     # be used by next() and given back to the firewall before it drives ff. 
    195     # This toggle lets every other instance of drive() either use the 
    196     # LWP::UserAgent or Firefox. 
    197     flip_flop => 0, 
    198  
    199     # An integer, representing how many relative links the browser 
    200     # should continue to drive to, before moving onto another 
    201     # website.  If negative, then the browser will exhaust all possible 
    202     # relative links, before moving on.  (This internal variable should 
    203     # never be modified externally.) 
    204     max_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), 
    205  
    206     # NOTE: Don't delete me unless you delete max_relative_links_to_visit 
    207     # An integer, representing the maximum number of relative links that 
    208     # the browser should visit, before moving onto another website.  If 
    209     # negative, then the browser will exhaust all possible relative links 
    210     # found, before moving on.  This functionality is best effort; it's 
    211     # possible for the browser to visit new links on previously visited 
    212     # websites. 
    213     _remaining_number_of_relative_links_to_visit => getVar(name => "max_relative_links_to_visit"), 
    214      
    215     #COPY DARIEN'S COMMENTS! YAY! (or not, boo!)) 
    216     _next_connections => {}, 
    217  
    218     #This is here so that it doesn't have to do a getVar every time it wants to run  
    219     # the LWP stuff. Can maybe be moved into the big datastructure. 
    220     http_proxy => getVar(name => "http_proxy"),  
    221  
    222 ); 
    223  
    224 ############################################################################### 
    225 # Private Methods Implemented                                                 # 
    226 ############################################################################### 
    227  
    228 # Helper function designed to retrieve the next link for the browser 
    229 # to navigate to. 
    230 
    231 # Note: Calling this function will implicitly remove the next link from 
    232 #       any and all applicable hashtables/scalars. 
    233 
    234 # When getting the next link, 'next_link_to_visit' is checked first. 
    235 # If that value is undef, then the 'relative_links_to_visit' hashtable 
    236 # is checked next.  If that hashtable is empty, then finally the 
    237 # 'links_to_visit' hashtable is checked. 
    238 
    239 # Inputs: HoneyClient::Agent::Driver::FF object 
    240 # Outputs: link, or undef if all applicable scalars/hashtables are empty 
    241 sub _getNextLink { 
    242  
    243     # Get the object state. 
    244     my $self = shift; 
    245  
    246     # Set the link to find as undef, initially. 
    247     my $link = undef; 
    248  
    249     while ( !defined($link) or ($link eq "") ) { 
    250  
    251         # Try getting the next link from the next link 
    252         # scalar. 
    253         $link = $self->next_link_to_visit; 
    254         $self->{next_link_to_visit} = undef; 
    255  
    256         # If the next link scalar is empty, try 
    257         # getting a link from the relative hashtable. 
    258         unless ( defined($link) ) { 
    259             $link = _pop( $self->relative_links_to_visit ); 
    260         } 
    261  
    262         # If the relative hashtable is empty, try getting one 
    263         # from the external hashtable. 
    264         unless ( defined($link) ) { 
    265             $link = _pop( $self->links_to_visit ); 
    266         } 
    267  
    268         # If all hashtables/scalars were empty, immediately return an 
    269         # undef value. 
    270         unless ( defined($link) ) { 
    271             return $link; 
    272         } 
    273  
    274         # Now, make sure the link is valid, before we return 
    275         # it; if it's not valid, we simply move on to the next 
    276         # one in our hashtables. 
    277         #$link = $self->_validateLink($link); 
    278     } 
    279  
    280     # Return the next link found. 
    281     return $link; 
     121# Use threads Library 
     122# TODO: Need unit testing. 
     123use threads; 
     124 
     125# TODO: Need unit testing. 
     126use threads::shared; 
     127 
     128# TODO: Need unit testing. 
     129use HoneyClient::Util::SOAP qw(getClientHandle); 
     130 
     131# TODO: Need unit testing. 
     132use Win32::Job; 
     133 
     134# TODO: clean this up. 
     135#my %PARAMS = ( 
     136#); 
     137 
     138####################################################################### 
     139# Private Methods Implemented                                         # 
     140####################################################################### 
     141 
     142#sub new { 
     143#    
     144#    # - This function takes in an optional hashtable, 
     145#    #   that contains various key => 'value' configuration 
     146#    #   parameters. 
     147#    # 
     148#    # - For each parameter given, it overwrites any corresponding 
     149#    #   parameters specified within the default hashtable, %PARAMS, 
     150#    #   with custom entries that were given as parameters. 
     151#    # 
     152#    # - Finally, it returns a blessed instance of the 
     153#    #   merged hashtable, as an 'object'. 
     154
     155#    # Get the class name. 
     156#    my $self = shift; 
     157
     158#    # Get the rest of the arguments, as a hashtable. 
     159#    # Hash-based arguments are used, since HoneyClient::Util::SOAP is unable to handle 
     160#    # hash references directly.  Thus, flat hashtables are used throughout the code 
     161#    # for consistency. 
     162#    my %args = @_; 
     163
     164#    # Check to see if the class name is inherited or defined. 
     165#    my $class = ref($self) || $self; 
     166
     167#    # Initialize default parameters. 
     168#    my %params = %{dclone(\%PARAMS)}; 
     169#    $self = $class->SUPER::new(); 
     170#    @{$self}{keys %params} = values %params; 
     171
     172#    # Now, overwrite any default parameters that were redefined 
     173#    # in the supplied arguments. 
     174#    @{$self}{keys %args} = values %args; 
     175
     176#    # Now, assign our object the appropriate namespace. 
     177#    bless $self, $class; 
     178
     179#    # Finally, return the blessed object. 
     180#    return $self; 
     181#} 
     182 
     183sub drive { 
     184 
     185    # Extract arguments. 
     186    my ($self, %args) = @_; 
     187 
     188    # Sanity check: Make sure we've been fed an object. 
     189    unless (ref($self)) { 
     190        Carp::croak "Error: Function must be called in reference to a " . 
     191                    __PACKAGE__ . "->new() object!\n"; 
     192    } 
     193 
     194    # Sanity check, don't get the next link, if 
     195    # we've been fed a url. 
     196    my $argsExist = scalar(%args); 
     197    if (!$argsExist || 
     198        !exists($args{'url'}) || 
     199        !defined($args{'url'})) { 
     200        # Get the next URL from our hashtables. 
     201        $args{'url'} = $self->_getNextLink(); 
     202    } 
     203 
     204    # Drive the generic browser before opening with IE 
     205    $self = $self->SUPER::drive(%args); 
     206 
     207    # Sanity check: Make sure our next URL is defined. 
     208    unless (defined($args{'url'})) { 
     209        Carp::croak "Error: Unable to drive browser - 'links_to_visit' " . 
     210                    "hashtable is empty!\n"; 
     211    } 
     212 
     213    # Indicates how long we wait for each drive operation to complete, 
     214    # before registering attempt as a failure. 
     215    my $timeout : shared = $self->timeout(); 
     216 
     217    # Create a new Job. 
     218    my $job = Win32::Job->new(); 
     219 
     220    # Sanity check. 
     221    if (!defined($job)) { 
     222        Carp::croak "Error: Unable to spawn new job - " . $^E . ".\n"; 
     223    } 
     224 
     225    # Spawn the job. 
     226    $job->spawn(undef, "\"C:\\Program Files\\Mozilla Firefox\\firefox.exe\"" . $args{'url'}); 
     227 
     228    # TODO: check to see if spawn fails. 
     229 
     230    # Run the job. 
     231    $job->run($timeout); 
     232 
     233    # TODO: check to see if run fails. 
     234 
     235    # Return the modified object state. 
     236    return $self; 
    282237} 
    283 ############################################################################### 
    284  
    285 # Helper function designed to get a current timestamp from 
    286 # the system OS. 
    287 
    288 # Note: This timestamp is in ISO 8601 format. 
    289 
    290 # Inputs: none 
    291 # Outputs: timestamp 
    292 sub _getTimestamp { 
    293     my $dt = DateTime::HiRes->now(); 
    294     return $dt . "." . $dt->nanosecond(); 
    295 
    296 ############################################################################### 
    297 # Helper function designed to "pop" a key off a given hashtable. 
    298 # When given a hashtable reference, this function will extract a valid key 
    299 # from the hashtable and delete the (key, value) pair from the 
    300 # hashtable. 
    301 
    302 # Note: There is no guaranteed order about how this function picks 
    303 # keys from the hashtable. 
    304 
    305 # Inputs: hashref 
    306 # Outputs: valid key, or undef if the hash is empty 
    307 sub _pop { 
    308  
    309     # Get supplied hash reference. 
    310     my $hash = shift; 
    311  
    312     # Get a new key. 
    313     my @keys = keys( %{$hash} ); 
    314     my $key  = pop(@keys); 
    315  
    316     # Delete the key from the hashtable. 
    317     if ( defined($key) ) { 
    318         delete $hash->{$key}; 
    319     } 
    320 ##  print "in pop " . Dumper($hash); 
    321  
    322     # Return the key found. 
    323     return $key; 
    324 
    325  
    326 ############################################################################### 
    327  
    328 # Helper function, designed to extract the hostname 
    329 # (and, if it exists, the port number) from a given 
    330 # URL. 
    331 
    332 # For example, if "http://hostname.com:80/path/index.html" 
    333 # is given, then "hostname:80" would be returned. 
    334 
    335 # Inputs: URL 
    336 # Outputs: hostname[:port] 
    337 sub _extractHostname { 
    338  
    339     # Sanity check. 
    340     my $arg = shift(); 
    341  
    342     if (!defined($arg)) { 
    343         return ""; 
    344     } 
    345  
    346     # Get the URL supplied. 
    347     my $url = $arg; 
    348  
    349     # Note: The '?' chars make a critical difference 
    350     # in how this regex operates. 
    351     $url =~ s/.*?\/\/(.*?)\/.*/$1/; 
    352  
    353     # Return the extracted hostname. 
    354     return $url; 
    355 
    356 ############################################################################### 
    357  
    358 # Helper function, designed to process all links found at a 
    359 # given URL, once the browser has been driven to that URL 
    360 # and has collected all corresponding links. 
    361 
    362 # When supplied with the array of URLs 
    363 # this function will categorize them 
    364 # as follows: 
    365 
    366 # "New" links are those we've never driven the browser to. 
    367 # "Old" links are those we've driven the browser to before. 
    368 
    369 # - If a link is new and "invalid", then it gets added to 
    370 #   the 'links_ignored' hashtable. 
    371 
    372 # - If a link is old and "invalid", then it gets 
    373 #   ignored. 
    374 
    375 # - If a link is old and "valid", then it gets ignored. 
    376 
    377 # - If a link is new and "valid", then we check to see if 
    378 #   the referring URL's hostname[:port] and the link's 
    379 #   hostname[:port] match.  If they match, then the link 
    380 #   is added to the 'relative_links_to_visit' hash. 
    381 #   Otherwise, the link is added to the 'links_to_visit' 
    382 #   hash. 
    383 
    384 # Inputs: HoneyClient::Agent::Driver::FF object, 
    385 #         referring URL, 
    386 #         array of LinkExtor link arrays 
    387 # Outputs: HoneyClient::Agent::Driver::FF object 
    388  
    389 sub _processLinks { 
    390  
    391     # Get the object state. 
    392     my $self = shift; 
    393  
    394     # Get the referrer and the corresponding array of links. 
    395     my ( $base_url, @links ) = @_; 
    396  
    397     # Sanity check: Make sure we've been fed an object. 
    398     unless ( ref($self) ) { 
    399         Carp::croak "Error: Function must be called in reference to a " 
    400           . __PACKAGE__ 
    401           . "->new() object!\n"; 
    402     } 
    403  
    404     foreach my $link (@links) { 
    405  
    406         #format of this array (pointed to by the $link ref) is 
    407         # like ("img", "src", "$url") or ("a", "href", "$url") 
    408  
    409         #skip some link types for now 
    410         if (   @{$link}[0] eq "form" 
    411             || @{$link}[0] eq "input" 
    412             || @{$link}[0] eq "script" 
    413             || @{$link}[0] eq "td" 
    414             || @{$link}[0] eq "table" ) 
    415         { 
    416             next; 
    417         } 
    418  
    419     #Need to put these in a bucket so the firewall knows to allow access to them 
    420         if (   @{$link}[0] eq "img" 
    421             || @{$link}[0] eq "object" 
    422             || @{$link}[0] eq "embed" ) 
    423         { 
    424  
    425             # lazy way, just absolutize right away incased it's a relative URL 
    426             # this should catch any time a base href is actually used 
    427             my $tmp_hn = 
    428               _extractHostname( URI->new_abs( "@{$link}[2]", "$base_url" ) ); 
    429 ##          print "@{$link}[0] @{$link}[2] $tmp_hn\n"; 
    430             my $tmp_port; 
    431  
    432     #if it already has a port appended use that, otherwise set it to the default 
    433             if ( $tmp_hn =~ /.*?:(.*)/ ) { 
    434                 $tmp_port = $1; 
    435             } 
    436             else { 
    437  
    438                 #Can you have image links to other protocols? 
    439                 #if(@{$link}[2] =~ /^ftp/) {$tmp_port = 21;} 
    440                 if ( @{$link}[2] =~ /^http/ )  { $tmp_port = 80; } 
    441                 if ( @{$link}[2] =~ /^https/ ) { $tmp_port = 443; } 
    442                 else { $tmp_port = 80; } 
    443             } 
    444             if(!exists($self->{hosts_to_resolve}{$tmp_hn})){ 
    445                 $self->{hosts_to_resolve}{$tmp_hn} = $tmp_port; 
    446                 print "adding $tmp_hn to hosts_to_resolve bucket in _processLinks\n"; 
    447             } 
    448              
    449             next; 
    450  
    451         } 
    452  
    453         if (   @{$link}[0] ne "a" 
    454             && @{$link}[0] ne "link" 
    455             && @{$link}[0] ne "area" 
    456             && @{$link}[0] ne "iframe" ) 
    457         { 
    458             print "new type of link!: @{$link}\n"; 
    459             $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); 
    460             next; 
    461         } 
    462  
    463         #strip anchors 
    464         @{$link}[2] =~ s/\#.*//; 
    465  
    466         #trying to avoid links already in the buckets 
    467         if (   $self->_preexisting( @{$link}[2] ) 
    468             || @{$link}[2]    eq $self->next_link_to_visit 
    469             || "@{$link}[2]/" eq $self->next_link_to_visit ) 
    470         { 
    471             next; 
    472         } 
    473  
    474         #       print "@{$link}\n"; 
    475  
    476        #starts with http(s):// 
    477        # Check to make sure it's not an absolute link to a site we're already on 
    478        # The second case of the if() is "the slashdot exception" ;) 
    479         if ( @{$link}[2] =~ /^http[s]?:\/\//i || @{$link}[2] =~ /^\/\//i ) { 
    480             my $tmp_url; 
    481             if ( _extractHostname( @{$link}[2] ) eq 
    482                 _extractHostname( $self->{next_link_to_visit} ) ) 
    483             { 
    484                 # URI->new_abs() will never return undef; however, 
    485                 # it could die if invalid parameters are supplied. 
    486                 $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); 
    487                 $self->relative_links_to_visit->{$tmp_url} = 1; 
    488                 next; 
    489             } 
    490             else { 
    491                 # URI->new_abs() will never return undef; however, 
    492                 # it could die if invalid parameters are supplied. 
    493                 $tmp_url = URI->new_abs( "@{$link}[2]", "$base_url" ); 
    494                 $self->links_to_visit->{$tmp_url} = 1; 
    495                 next; 
    496             } 
    497         } 
    498  
    499         #ignore mailto and javascript links 
    500         if (   @{$link}[2] =~ /^mailto/ 
    501             || @{$link}[2] =~ /;$/ 
    502             || @{$link}[2] =~ /^javascript:/ ) 
    503         { 
    504             $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); 
    505             next; 
    506         } 
    507  
    508        #ignore any other URI types for now since we've already got http(s) links 
    509         if ( @{$link}[2] =~ /^\w*:\/\// ) { 
    510             $self->links_ignored->{ @{$link}[2] } = _getTimestamp(); 
    511             next; 
    512         } 
    513  
    514        #From here on out it should be a relative link, so first we absolutize it 
    515         # URI->new_abs() will never return undef; however, 
    516         # it could die if invalid parameters are supplied. 
    517         my $uri = URI->new_abs( "@{$link}[2]", "$base_url" ); 
    518  
    519       # Then we check to make sure the absolute link isn't a variant of the page 
    520       # we got it from, and not already visited 
    521         if (   $uri ne $base_url 
    522             && $uri    ne "$base_url/" 
    523             && $uri    ne $self->next_link_to_visit 
    524             && "$uri/" ne $self->next_link_to_visit ) 
    525         { 
    526  
    527             if ( !$self->_preexisting($uri) ) { 
    528  
    529 #               print "2 adding (formerly) relative link $uri to the relative links to visit\n"; 
    530                 $self->relative_links_to_visit->{$uri} = 1; 
    531             } 
    532         } 
    533  
    534     } 
    535  
    536     # Return the modified object state. 
    537     return $self; 
    538 
    539  
    540 ############################################################################### 
    541  
    542 # returns 1 if a URI exists somewhere in the buckets 
    543 # otherwise returns 0 
    544  
    545 # All the commented out code is related to whether we should test for the 
    546 # existence of a link that is the same as the current link except possibly 
    547 # having or missing a trailing slash. 
    548 # Currently that code is not used, and therefore it will treat 
    549 # http://www.foo.com/bar 
    550 # as a different link than 
    551 # http://www.foo.com/bar/ 
    552 # And therefore go to both link independantly 
    553 # Comments on whether this should be the case or not are greatly appreciated 
    554  
    555 sub _preexisting { 
    556     my $self = shift; 
    557     my $uri  = shift; 
    558  
    559     #       if($uri !~ /\/$/){ 
    560     #try as is only (currently...and with trailing slash maybe later) 
    561     if ( 
    562         exists( $self->links_to_visit->{$uri} ) 
    563         || 
    564  
    565         #                   $self->links_to_visit->{"$uri/"} || 
    566         exists( $self->relative_links_to_visit->{$uri} ) || 
    567  
    568         #                   $self->relative_links_to_visit->{"$uri/"} || 
    569         exists( $self->links_ignored->{$uri} ) || 
    570  
    571         #                   $self->links_ignored->{"$uri/"} || 
    572         exists( $self->links_visited->{$uri} ) 
    573       ) 
    574     { 
    575  
    576         #                   $self->links_visited->{"$uri/"} ){ 
    577         return 1; 
    578     } 
    579  
    580     #       } 
    581     #       else{ 
    582     #try without a trailing slash 
    583     #           $uri =~ s/\/$//; 
    584     #           if(    $self->links_to_visit->{$uri} || 
    585     #                  $self->relative_links_to_visit->{$uri} || 
    586     #                  $self->links_ignored->{$uri} || 
    587     #                  $self->links_visited->{$uri} ){ 
    588     #               return 1; 
    589     #           } 
    590     #       } 
    591  
    592     return 0; 
    593 
    594  
    595 ############################################################################### 
    596 # Public Methods Implemented                                                  # 
    597 ############################################################################### 
    598  
    599 =pod 
    600  
    601 =head1 METHODS IMPLEMENTED 
    602  
    603 The following functions have been implemented by the FF driver.  Many 
    604 of these methods were implementations of the parent Driver interface. 
    605  
    606 As such, the following code descriptions pertain to this particular  
    607 Driver implementation.  For further information about the generic 
    608 Driver interface, see the L<HoneyClient::Agent::Driver> documentation. 
    609  
    610 =head2 HoneyClient::Agent::Driver::FF->new($param => $value, ...) 
    611  
    612 =over 4 
    613  
    614 Creates a new FF driver object, which contains a hashtable 
    615 containing any of the supplied "param => value" arguments. 
    616  
    617 I<Inputs>: 
    618  B<$param> is an optional parameter variable. 
    619  B<$value> is $param's corresponding value. 
    620   
    621 Note: If any $param(s) are supplied, then an equal number of 
    622 corresponding $value(s) B<must> also be specified. 
    623  
    624 I<Output>: The instantiated FF driver B<$object>, fully initialized. 
    625  
    626 =back 
    627  
    628 =begin testing 
    629  
    630 # XXX: Add this. 
     238 
     239####################################################################### 
     240 
    6312411; 
    632  
    633 =end testing 
    634  
    635 =cut 
    636  
    637 sub new { 
    638  
    639     # - This function takes in an optional hashtable, 
    640     #   that contains various key => 'value' configuration 
    641     #   parameters. 
    642     # 
    643     # - For each parameter given, it overwrites any corresponding 
    644     #   parameters specified within the default hashtable, %PARAMS, 
    645     #   with custom entries that were given as parameters. 
    646     # 
    647     # - Finally, it returns a blessed instance of the 
    648     #   merged hashtable, as an 'object'. 
    649  
    650     # Get the class name. 
    651     my $self = shift; 
    652  
    653     # Get the rest of the arguments, as a hashtable. 
    654     # Hash-based arguments are used, since HoneyClient::Util::SOAP is unable to handle 
    655     # hash references directly.  Thus, flat hashtables are used throughout the code 
    656     # for consistency. 
    657     my %args = @_; 
    658  
    659     # Check to see if the class name is inherited or defined. 
    660     my $class = ref($self) || $self; 
    661  
    662     # Initialize default parameters. 
    663     my %params = %{ dclone( \%PARAMS ) }; 
    664     $self = $class->SUPER::new(); 
    665     @{$self}{ keys %params } = values %params; 
    666  
    667     # Now, overwrite any default parameters that were redefined 
    668     # in the supplied arguments. 
    669     @{$self}{ keys %args } = values %args; 
    670  
    671     # Now, assign our object the appropriate namespace. 
    672     bless $self, $class; 
    673      
    674     #bla 
    675     $self->{_remaining_number_of_relative_links_to_visit} =  
    676         $self->{max_relative_links_to_visit}; 
    677  
    678     # Finally, return the blessed object.