| 976 | | $ua->default_header( 'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5' ); |
|---|
| 977 | | |
|---|
| 978 | | my $response = $ua->request( |
|---|
| 979 | | HTTP::Request->new( |
|---|
| 980 | | GET => $args{'url'}, |
|---|
| 981 | | HTTP::Headers->new( |
|---|
| 982 | | # TODO: Add custom headers here? |
|---|
| 983 | | 'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', |
|---|
| 984 | | ), |
|---|
| 985 | | ) |
|---|
| 986 | | ); |
|---|
| | 970 | $ua->default_header( 'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, */*', |
|---|
| | 971 | 'Accept-Language' => 'en-us, en-uk', |
|---|
| | 972 | 'User-Agent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', |
|---|
| | 973 | 'Proxy-Connection' => 'Keep-Alive', |
|---|
| | 974 | ); |
|---|
| | 975 | |
|---|
| | 976 | my $response = $ua->get($args{'url'}); |
|---|
| 1001 | | # Check to see if the content is Flash-based. |
|---|
| 1002 | | if ($type eq "application/x-shockwave-flash") { |
|---|
| 1003 | | |
|---|
| 1004 | | # Save content to a temp file on disk. |
|---|
| 1005 | | my $tempFile = new File::Temp(SUFFIX => '.swf'); |
|---|
| 1006 | | print $tempFile $content; |
|---|
| 1007 | | $tempFile->close(); |
|---|
| 1008 | | |
|---|
| 1009 | | %scored_links = HoneyClient::Agent::Driver::ActiveContent::process( |
|---|
| 1010 | | file => $tempFile, |
|---|
| 1011 | | base_url => $base, |
|---|
| 1012 | | ); |
|---|
| 1013 | | |
|---|
| 1014 | | # TODO: Check to make sure that temp files are getting deleted properly |
|---|
| 1015 | | # (when normal conditions apply). |
|---|
| 1016 | | |
|---|
| 1017 | | # Assume that all other content types are HTML-based. |
|---|
| 1018 | | } else { |
|---|
| 1019 | | # Score the new links based on their surrounding HTML context |
|---|
| 1020 | | # Extract the good word and bad word lists into arrays; |
|---|
| 1021 | | my @good_words = split /,/, $self->goodwords; |
|---|
| 1022 | | my @bad_words = split /,/, $self->badwords; |
|---|
| 1023 | | my %wordlists = ('good' => \@good_words, 'bad' => \@bad_words); |
|---|
| 1024 | | # Call the link scoring function |
|---|
| 1025 | | %scored_links = _scoreLinks($base, $content, %wordlists); |
|---|
| 1026 | | } |
|---|
| | 990 | # Extract the good word and bad word lists into arrays; |
|---|
| | 991 | my @good_words = split /,/, $self->goodwords; |
|---|
| | 992 | my @bad_words = split /,/, $self->badwords; |
|---|
| | 993 | my %wordlists = ('good' => \@good_words, 'bad' => \@bad_words); |
|---|
| | 994 | # Call the link scoring function |
|---|
| | 995 | %scored_links = _scoreLinks($base, $content, %wordlists); |
|---|
| 1302 | | if (!$content) { |
|---|
| 1303 | | return %links; |
|---|
| 1304 | | } |
|---|
| 1305 | | |
|---|
| 1306 | | # TODO: Extract absolute and relative URLs from <EMBED> tags, that would render |
|---|
| 1307 | | # Flash animations inline to the webpage. |
|---|
| 1308 | | |
|---|
| 1309 | | # Begin to scour the HTML content for <a> tags, parsing attributes and text |
|---|
| 1310 | | while ($content =~ m{<a\b([^>]+)>(.*?)</a>}ig) { |
|---|
| 1311 | | my $attr = $1; |
|---|
| 1312 | | my $text = $2; |
|---|
| 1313 | | my $score = 0; |
|---|
| 1314 | | |
|---|
| 1315 | | # Look for the link in the attribute data |
|---|
| 1316 | | if ($attr =~ m{ |
|---|
| 1317 | | \b HREF |
|---|
| 1318 | | \s* = \s* |
|---|
| 1319 | | (?: |
|---|
| 1320 | | "([^"]*)" |
|---|
| 1321 | | | |
|---|
| 1322 | | '([^']*)' |
|---|
| 1323 | | | |
|---|
| 1324 | | {[^'">\s]+} |
|---|
| 1325 | | ) |
|---|
| 1326 | | }xi) |
|---|
| 1327 | | { |
|---|
| 1328 | | $url = $+; |
|---|
| 1329 | | |
|---|
| 1330 | | # Some programmatic values |
|---|
| 1331 | | my $min_text_length = 6; |
|---|
| 1332 | | my $max_text_length = 20; |
|---|
| 1333 | | my $image_bonus = 50; |
|---|
| 1334 | | my $default_display_size = 1024 * 768; |
|---|
| 1335 | | my $word_value = 6; |
|---|
| 1336 | | |
|---|
| 1337 | | # We have to make this an absolute url (if it's not) |
|---|
| 1338 | | # before using it as a key in the %links hash |
|---|
| 1339 | | $url = url($url, $base)->abs; |
|---|
| 1340 | | |
|---|
| 1341 | | # The link must be an HREF and be a http(s) link |
|---|
| 1342 | | if ($url =~ /^http/i) { |
|---|
| 1343 | | # Begin scoring the link based on surrounding context |
|---|
| 1344 | | # This can be improved/customized in many different ways. |
|---|
| 1345 | | # Our implementation is only one possible way to assign |
|---|
| 1346 | | # values to the context elements. |
|---|
| 1347 | | |
|---|
| 1348 | | # Score length of link text. These are arbitrary lengths, but |
|---|
| 1349 | | # the reasoning is that really short text links are not too |
|---|
| 1350 | | # visible (we are excluding image links from this criteria), |
|---|
| 1351 | | # and really long text would be weird or abnormal to the human |
|---|
| 1352 | | # web surfer. |
|---|
| 1353 | | if ($text !~ /img /i && |
|---|
| 1354 | | length($text) > $min_text_length && |
|---|
| 1355 | | length($text) < $max_text_length) { |
|---|
| 1356 | | $score += length($text); |
|---|
| 1357 | | } |
|---|
| 1358 | | |
|---|
| 1359 | | # Score the image content, if it exists |
|---|
| 1360 | | # We score the size proportional to a 1024 X 768 display |
|---|
| 1361 | | # Image bonus |
|---|
| 1362 | | if ($text =~ /img /i) { |
|---|
| 1363 | | $score += $image_bonus; |
|---|
| 1364 | | } |
|---|
| 1365 | | # Score image size |
|---|
| 1366 | | my $width; |
|---|
| 1367 | | my $height; |
|---|
| 1368 | | if ($text =~ /\b WIDTH\s*=\s*.(\d+)/xi) { |
|---|
| 1369 | | $width = $1; |
|---|
| 1370 | | } |
|---|
| 1371 | | if ($text =~ /\b HEIGHT\s*=\s*.(\d+)/xi) { |
|---|
| 1372 | | $height = $1; |
|---|
| 1373 | | } |
|---|
| 1374 | | if ($width && $height) { |
|---|
| 1375 | | $score += int(($width*$height)/($default_display_size)*100); |
|---|
| 1376 | | } |
|---|
| 1377 | | elsif ($width) { |
|---|
| 1378 | | $score += int($width/10); |
|---|
| 1379 | | } |
|---|
| 1380 | | elsif ($height) { |
|---|
| 1381 | | $score += int($height/10); |
|---|
| 1382 | | } |
|---|
| 1383 | | |
|---|
| 1384 | | # Good word bonus |
|---|
| 1385 | | foreach (@good_words) { |
|---|
| 1386 | | if ($text =~ /$_/i) { |
|---|
| 1387 | | $score += $word_value; |
|---|
| 1388 | | } |
|---|
| 1389 | | } |
|---|
| 1390 | | |
|---|
| 1391 | | # Bad word penalty |
|---|
| 1392 | | foreach (@bad_words) { |
|---|
| 1393 | | if ($text =~ /$_/i) { |
|---|
| 1394 | | $score -= $word_value; |
|---|
| 1395 | | } |
|---|
| 1396 | | } |
|---|
| 1397 | | |
|---|
| 1398 | | # Put it in the return value hash and zero the score |
|---|
| 1399 | | $links{$url} = $score; |
|---|
| 1400 | | $url = undef; |
|---|
| 1401 | | } |
|---|
| 1402 | | } |
|---|
| 1403 | | } |
|---|
| 1404 | | return %links; |
|---|
| | 1270 | if ( !$content ) { |
|---|
| | 1271 | return %links; |
|---|
| | 1272 | } |
|---|
| | 1273 | |
|---|
| | 1274 | # Begin to scour the HTML content one line at a time. |
|---|
| | 1275 | # Get wanted tags, parsing type (which), attributes and text |
|---|
| | 1276 | while ( $content =~ /(.+)\n/g ) { |
|---|
| | 1277 | my $line = $1; |
|---|
| | 1278 | if ( !defined($line) ) { |
|---|
| | 1279 | # Empty line |
|---|
| | 1280 | next; |
|---|
| | 1281 | } |
|---|
| | 1282 | my ( $which, $attr, $text ); |
|---|
| | 1283 | # Process all tags on the line |
|---|
| | 1284 | while ( $line =~ m{<(a|embed)\b([^>]+)>(.*?)</(a|embed)>}ig ) { |
|---|
| | 1285 | $which = $1; |
|---|
| | 1286 | $attr = $2; |
|---|
| | 1287 | $text = $3; |
|---|
| | 1288 | my ( $url, $score ); |
|---|
| | 1289 | if ($which eq "a") { |
|---|
| | 1290 | ($url, $score) = _addHTMLLink( $base, $attr, $text, @good_words, @bad_words ); |
|---|
| | 1291 | } |
|---|
| | 1292 | elsif ($which eq "embed") { |
|---|
| | 1293 | ($url, $score) = _addFlashLink( $base, $attr, $text ); |
|---|
| | 1294 | } |
|---|
| | 1295 | $links{$url} = $score unless !defined($url); |
|---|
| | 1296 | } |
|---|
| | 1297 | } |
|---|
| | 1298 | |
|---|
| | 1299 | return %links; |
|---|
| | 1300 | } |
|---|
| | 1301 | |
|---|
| | 1302 | sub _addFlashLink { |
|---|
| | 1303 | |
|---|
| | 1304 | my ( $base, $attr, $text ) = @_; |
|---|
| | 1305 | my $source; |
|---|
| | 1306 | my $type; |
|---|
| | 1307 | my $score = 50; |
|---|
| | 1308 | |
|---|
| | 1309 | # Look for the link in the attribute data |
|---|
| | 1310 | if ( |
|---|
| | 1311 | $attr =~ m{ |
|---|
| | 1312 | \b src |
|---|
| | 1313 | \s* = \s* |
|---|
| | 1314 | (?: |
|---|
| | 1315 | "([^"]*)" |
|---|
| | 1316 | | |
|---|
| | 1317 | '([^']*)' |
|---|
| | 1318 | | |
|---|
| | 1319 | {[^'">\s]+} |
|---|
| | 1320 | ) |
|---|
| | 1321 | }xi |
|---|
| | 1322 | ) |
|---|
| | 1323 | { |
|---|
| | 1324 | $source = $+; |
|---|
| | 1325 | } |
|---|
| | 1326 | |
|---|
| | 1327 | # Look for the link in the attribute data |
|---|
| | 1328 | if ( |
|---|
| | 1329 | $attr =~ m{ |
|---|
| | 1330 | \b type |
|---|
| | 1331 | \s* = \s* |
|---|
| | 1332 | (?: |
|---|
| | 1333 | "([^"]*)" |
|---|
| | 1334 | | |
|---|
| | 1335 | '([^']*)' |
|---|
| | 1336 | | |
|---|
| | 1337 | {[^'">\s]+} |
|---|
| | 1338 | ) |
|---|
| | 1339 | }xi |
|---|
| | 1340 | ) |
|---|
| | 1341 | { |
|---|
| | 1342 | $type = $+; |
|---|
| | 1343 | $type = lc($type); |
|---|
| | 1344 | } |
|---|
| | 1345 | |
|---|
| | 1346 | if ( !defined($type) ) { |
|---|
| | 1347 | $type = "unknown"; |
|---|
| | 1348 | } |
|---|
| | 1349 | if ( !defined($source) ) { |
|---|
| | 1350 | $source = "unknown"; |
|---|
| | 1351 | } |
|---|
| | 1352 | |
|---|
| | 1353 | if ( ( $type eq "application/x-shockwave-flash" ) |
|---|
| | 1354 | || ( $source =~ /.swf$/ ) ) |
|---|
| | 1355 | { |
|---|
| | 1356 | # Make this an absolute URL |
|---|
| | 1357 | $source = url( $source, $base )->abs; |
|---|
| | 1358 | return ( $source, $score ); |
|---|
| | 1359 | } |
|---|
| | 1360 | return undef; |
|---|
| | 1361 | } |
|---|
| | 1362 | |
|---|
| | 1363 | sub _addHTMLLink { |
|---|
| | 1364 | |
|---|
| | 1365 | my ( $base, $attr, $text, @good_words, @bad_words ) = @_; |
|---|
| | 1366 | my $score = 0; |
|---|
| | 1367 | my $url; |
|---|
| | 1368 | |
|---|
| | 1369 | # Look for the link in the attribute data |
|---|
| | 1370 | if ( |
|---|
| | 1371 | $attr =~ m{ |
|---|
| | 1372 | \b HREF |
|---|
| | 1373 | \s* = \s* |
|---|
| | 1374 | (?: |
|---|
| | 1375 | "([^"]*)" |
|---|
| | 1376 | | |
|---|
| | 1377 | '([^']*)' |
|---|
| | 1378 | | |
|---|
| | 1379 | {[^'">\s]+} |
|---|
| | 1380 | ) |
|---|
| | 1381 | }xi |
|---|
| | 1382 | ) |
|---|
| | 1383 | { |
|---|
| | 1384 | $url = $+; |
|---|
| | 1385 | |
|---|
| | 1386 | # Some programmatic values |
|---|
| | 1387 | my $min_text_length = 6; |
|---|
| | 1388 | my $max_text_length = 20; |
|---|
| | 1389 | my $image_bonus = 50; |
|---|
| | 1390 | my $default_display_size = 1024 * 768; |
|---|
| | 1391 | my $word_value = 6; |
|---|
| | 1392 | |
|---|
| | 1393 | # We have to make this an absolute url (if it's not) |
|---|
| | 1394 | # before using it as a key in the %links hash |
|---|
| | 1395 | $url = url( $url, $base )->abs; |
|---|
| | 1396 | |
|---|
| | 1397 | # The link must be an HREF and be a http(s) link |
|---|
| | 1398 | if ( $url =~ /^http/i ) { |
|---|
| | 1399 | |
|---|
| | 1400 | # Begin scoring the link based on surrounding context |
|---|
| | 1401 | # This can be improved/customized in many different ways. |
|---|
| | 1402 | # Our implementation is only one possible way to assign |
|---|
| | 1403 | # values to the context elements. |
|---|
| | 1404 | |
|---|
| | 1405 | # Score length of link text. These are arbitrary lengths, but |
|---|
| | 1406 | # the reasoning is that really short text links are not too |
|---|
| | 1407 | # visible (we are excluding image links from this criteria), |
|---|
| | 1408 | # and really long text would be weird or abnormal to the human |
|---|
| | 1409 | # web surfer. |
|---|
| | 1410 | if ( $text !~ /img /i |
|---|
| | 1411 | && length($text) > $min_text_length |
|---|
| | 1412 | && length($text) < $max_text_length ) |
|---|
| | 1413 | { |
|---|
| | 1414 | $score += length($text); |
|---|
| | 1415 | } |
|---|
| | 1416 | |
|---|
| | 1417 | # Score the image content, if it exists |
|---|
| | 1418 | # We score the size proportional to a 1024 X 768 display |
|---|
| | 1419 | # Image bonus |
|---|
| | 1420 | if ( $text =~ /img /i ) { |
|---|
| | 1421 | $score += $image_bonus; |
|---|
| | 1422 | } |
|---|
| | 1423 | |
|---|
| | 1424 | # Score image size |
|---|
| | 1425 | my $width; |
|---|
| | 1426 | my $height; |
|---|
| | 1427 | if ( $text =~ /\b WIDTH\s*=\s*.(\d+)/xi ) { |
|---|
| | 1428 | $width = $1; |
|---|
| | 1429 | } |
|---|
| | 1430 | if ( $text =~ /\b HEIGHT\s*=\s*.(\d+)/xi ) { |
|---|
| | 1431 | $height = $1; |
|---|
| | 1432 | } |
|---|
| | 1433 | if ( $width && $height ) { |
|---|
| | 1434 | $score += |
|---|
| | 1435 | int( ( $width * $height ) / ($default_display_size) * 100 ); |
|---|
| | 1436 | } |
|---|
| | 1437 | elsif ($width) { |
|---|
| | 1438 | $score += int( $width / 10 ); |
|---|
| | 1439 | } |
|---|
| | 1440 | elsif ($height) { |
|---|
| | 1441 | $score += int( $height / 10 ); |
|---|
| | 1442 | } |
|---|
| | 1443 | |
|---|
| | 1444 | # Good word bonus |
|---|
| | 1445 | foreach (@good_words) { |
|---|
| | 1446 | if ( $text =~ /$_/i ) { |
|---|
| | 1447 | $score += $word_value; |
|---|
| | 1448 | } |
|---|
| | 1449 | } |
|---|
| | 1450 | |
|---|
| | 1451 | # Bad word penalty |
|---|
| | 1452 | foreach (@bad_words) { |
|---|
| | 1453 | if ( $text =~ /$_/i ) { |
|---|
| | 1454 | $score -= $word_value; |
|---|
| | 1455 | } |
|---|
| | 1456 | } |
|---|
| | 1457 | |
|---|
| | 1458 | # Put it in the return value hash and zero the score |
|---|
| | 1459 | return ( $url, $score ); |
|---|
| | 1460 | } |
|---|
| | 1461 | } |
|---|
| | 1462 | return undef; |
|---|