| 1282 | | # Begin to scour the HTML content for <a> tags, parsing attributes and text |
|---|
| 1283 | | while ($content =~ m{<a\b([^>]+)>(.*?)</a>}ig) { |
|---|
| 1284 | | my $attr = $1; |
|---|
| 1285 | | my $text = $2; |
|---|
| | 1281 | # Begin to scour the HTML content for tags, parsing attributes and text |
|---|
| | 1282 | # Any tag which has an HREF, IMG, or SRC attribute could potentially |
|---|
| | 1283 | # have a url of interest, either for scoring or for punching a hole in |
|---|
| | 1284 | # the firewall. |
|---|
| | 1285 | while ($content =~ m{<(IFRAME|A|LINK|IMG|OBJECT|EMBED|SCRIPT)\b([^>]+)>(.*?)</(\1)>}ig) { |
|---|
| | 1286 | my $attr = $2; |
|---|
| | 1287 | my $text = $3; |
|---|
| 1314 | | # The link must be an HREF and be an http(s) link |
|---|
| 1315 | | if ($url =~ /^http/i) { |
|---|
| 1316 | | # Begin scoring the link based on surrounding context |
|---|
| 1317 | | # This can be improved/customized in many different ways. |
|---|
| 1318 | | # Our implementation is only one possible way to assign |
|---|
| 1319 | | # values to the context elements. |
|---|
| 1320 | | |
|---|
| 1321 | | # Score length of link text. These are arbitrary lengths, but |
|---|
| 1322 | | # the reasoning is that really short text links are not too |
|---|
| 1323 | | # visible (we are excluding image links from this criterion), |
|---|
| 1324 | | # and really long text would be weird or abnormal to the human |
|---|
| 1325 | | # web surfer. |
|---|
| 1326 | | if ($text !~ /img /i && |
|---|
| 1327 | | length($text) > $min_text_length && |
|---|
| 1328 | | length($text) < $max_text_length) { |
|---|
| 1329 | | $score += length($text); |
|---|
| 1330 | | } |
|---|
| 1331 | | |
|---|
| 1332 | | # Score the image content, if it exists |
|---|
| 1333 | | # We score the size proportional to a 1024 X 768 display |
|---|
| 1334 | | # Image bonus |
|---|
| 1335 | | if ($text =~ /img /i) { |
|---|
| 1336 | | $score += $image_bonus; |
|---|
| 1337 | | } |
|---|
| 1338 | | # Score image size |
|---|
| 1339 | | my $width; |
|---|
| 1340 | | my $height; |
|---|
| 1341 | | if ($text =~ /\b WIDTH\s*=\s*.(\d+)/xi) { |
|---|
| 1342 | | $width = $1; |
|---|
| 1343 | | } |
|---|
| 1344 | | if ($text =~ /\b HEIGHT\s*=\s*.(\d+)/xi) { |
|---|
| 1345 | | $height = $1; |
|---|
| 1346 | | } |
|---|
| 1347 | | if ($width && $height) { |
|---|
| 1348 | | $score += int(($width*$height)/($default_display_size)*100); |
|---|
| 1349 | | } |
|---|
| 1350 | | elsif ($width) { |
|---|
| 1351 | | $score += int($width/10); |
|---|
| 1352 | | } |
|---|
| 1353 | | elsif ($height) { |
|---|
| 1354 | | $score += int($height/10); |
|---|
| 1355 | | } |
|---|
| 1356 | | |
|---|
| 1357 | | # Good word bonus |
|---|
| 1358 | | foreach (@good_words) { |
|---|
| 1359 | | if ($text =~ /$_/i) { |
|---|
| 1360 | | $score += $word_value; |
|---|
| 1361 | | } |
|---|
| 1362 | | } |
|---|
| 1363 | | |
|---|
| 1364 | | # Bad word penalty |
|---|
| 1365 | | foreach (@bad_words) { |
|---|
| 1366 | | if ($text =~ /$_/i) { |
|---|
| 1367 | | $score -= $word_value; |
|---|
| 1368 | | } |
|---|
| 1369 | | } |
|---|
| 1370 | | |
|---|
| 1371 | | # Put it in the return value hash and zero the score |
|---|
| 1372 | | $links{$url} = $score; |
|---|
| 1373 | | $url = undef; |
|---|
| | 1316 | # Begin scoring the link based on surrounding context |
|---|
| | 1317 | # This can be improved/customized in many different ways. |
|---|
| | 1318 | # Our implementation is only one possible way to assign |
|---|
| | 1319 | # values to the context elements. |
|---|
| | 1320 | |
|---|
| | 1321 | my $width; |
|---|
| | 1322 | my $height; |
|---|
| | 1323 | # Score the size of an object based on width and height |
|---|
| | 1324 | if ($attr =~ /\b WIDTH\s*=\s*.(\d+)/xi) { |
|---|
| | 1325 | $width = $1; |
|---|
| | 1327 | if ($attr =~ /\b HEIGHT\s*=\s*.(\d+)/xi) { |
|---|
| | 1328 | $height = $1; |
|---|
| | 1329 | } |
|---|
| | 1330 | if ($width && $height) { |
|---|
| | 1331 | $score += int(($width*$height)/($default_display_size)*100); |
|---|
| | 1332 | } |
|---|
| | 1333 | elsif ($width) { |
|---|
| | 1334 | $score += int($width/10); |
|---|
| | 1335 | } |
|---|
| | 1336 | elsif ($height) { |
|---|
| | 1337 | $score += int($height/10); |
|---|
| | 1338 | } |
|---|
| | 1339 | |
|---|
| | 1340 | # Score length of link text. These are arbitrary lengths, but |
|---|
| | 1341 | # the reasoning is that really short text links are not too |
|---|
| | 1342 | # visible (we are excluding image links from this criterion), |
|---|
| | 1343 | # and really long text would be weird or abnormal to the human |
|---|
| | 1344 | # web surfer. |
|---|
| | 1345 | if ($text !~ /img /i && |
|---|
| | 1346 | length($text) > $min_text_length && |
|---|
| | 1347 | length($text) < $max_text_length) { |
|---|
| | 1348 | $score += length($text); |
|---|
| | 1349 | } |
|---|
| | 1350 | |
|---|
| | 1351 | # Score the image content, if it exists |
|---|
| | 1352 | # We score the size proportional to a 1024 X 768 display |
|---|
| | 1353 | # Image bonus |
|---|
| | 1354 | if ($text =~ /img /i) { |
|---|
| | 1355 | $score += $image_bonus; |
|---|
| | 1356 | } |
|---|
| | 1357 | # Score image size |
|---|
| | 1358 | $width = undef; |
|---|
| | 1359 | $height = undef; |
|---|
| | 1360 | if ($text =~ /\b WIDTH\s*=\s*.(\d+)/xi) { |
|---|
| | 1361 | $width = $1; |
|---|
| | 1362 | } |
|---|
| | 1363 | if ($text =~ /\b HEIGHT\s*=\s*.(\d+)/xi) { |
|---|
| | 1364 | $height = $1; |
|---|
| | 1365 | } |
|---|
| | 1366 | if ($width && $height) { |
|---|
| | 1367 | $score += int(($width*$height)/($default_display_size)*100); |
|---|
| | 1368 | } |
|---|
| | 1369 | elsif ($width) { |
|---|
| | 1370 | $score += int($width/10); |
|---|
| | 1371 | } |
|---|
| | 1372 | elsif ($height) { |
|---|
| | 1373 | $score += int($height/10); |
|---|
| | 1374 | } |
|---|
| | 1375 | |
|---|
| | 1376 | # Good word bonus |
|---|
| | 1377 | foreach (@good_words) { |
|---|
| | 1378 | if ($text =~ /$_/i) { |
|---|
| | 1379 | $score += $word_value; |
|---|
| | 1380 | } |
|---|
| | 1381 | } |
|---|
| | 1382 | |
|---|
| | 1383 | # Bad word penalty |
|---|
| | 1384 | foreach (@bad_words) { |
|---|
| | 1385 | if ($text =~ /$_/i) { |
|---|
| | 1386 | $score -= $word_value; |
|---|
| | 1387 | } |
|---|
| | 1388 | } |
|---|
| | 1389 | |
|---|
| | 1390 | # Put it in the return value hash and zero the score |
|---|
| | 1391 | $links{$url} = $score; |
|---|
| | 1392 | $url = undef; |
|---|