| 273 | | # Get any urls currently assigned to this host |
|---|
| 274 | | # Calculate the time which urls must be older than |
|---|
| 275 | | #XXX: Hardcoded url num values and age, but will eventually be configuration variables |
|---|
| 276 | | min_age_by_min = 5 |
|---|
| 277 | | older_than_time = Time.now.to_f - min_age_by_min*60 |
|---|
| 278 | | |
|---|
| 279 | | # Get 60% of requested URLs by Highest Priority |
|---|
| 280 | | #num_by_priority = (num_urls*0.6).ceil |
|---|
| 281 | | num_by_priority = 12 |
|---|
| | 269 | # Get any URLs currently assigned to this host. |
|---|
| | 270 | # Calculate the time which URLs must be older than. |
|---|
| | 271 | older_than_time = Time.now.to_f - (Configuration.get(:name => "revisit_delay", :namespace => "QueueUrl").to_i * 60) |
|---|
| | 272 | |
|---|
| | 273 | # Get the minimum number of URLs that should be assigned. |
|---|
| | 274 | min_num_urls = Configuration.get(:name => "min_num_urls", :namespace => "QueueUrl").to_i |
|---|
| | 275 | |
|---|
| | 276 | # Keep track of the remaining number of URLs to assign. |
|---|
| | 277 | remaining_num_urls = min_num_urls |
|---|
| | 278 | |
|---|
| | 279 | # Get a percentage of requested URLs by highest priority. |
|---|
| | 280 | # Calculate the percentage needed. |
|---|
| | 281 | num_by_priority = ((Configuration.get(:name => "percentage_urls_by_priority", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil |
|---|
| | 282 | # Make sure we have enough remaining slots to fill. |
|---|
| | 283 | num_by_priority = (num_by_priority > remaining_num_urls) ? remaining_num_urls : num_by_priority |
|---|
| | 284 | # Update the number of slots we have left. |
|---|
| | 285 | remaining_num_urls = remaining_num_urls - num_by_priority |
|---|
| | 286 | |
|---|
| | 287 | # Get the URLs. |
|---|
| 286 | | # If no urls are retrieved in first query, avoid further queries |
|---|
| | 292 | # Only perform additional lookups when we still have slots left and when the last query didn't |
|---|
| | 293 | # give us an unexpected empty set of URLs. |
|---|
| | 294 | if ((remaining_num_urls > 0) && (((num_by_priority > 0) && (urls.length > 0)) || (num_by_priority <= 0))) |
|---|
| | 295 | # Get a percentage of requested URLs by age. |
|---|
| | 296 | # Calculate the percentage needed. |
|---|
| | 297 | num_by_age = ((Configuration.get(:name => "percentage_urls_by_age", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil |
|---|
| | 298 | # Make sure we have enough remaining slots to fill. |
|---|
| | 299 | num_by_age = (num_by_age > remaining_num_urls) ? remaining_num_urls : num_by_age |
|---|
| | 300 | # Update the number of slots we have left. |
|---|
| | 301 | remaining_num_urls = remaining_num_urls - num_by_age |
|---|
| | 302 | |
|---|
| | 303 | if (urls.length > 0) |
|---|
| | 304 | urls += QueueUrl.find(:all, |
|---|
| | 305 | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, |
|---|
| | 306 | :order => "created_at ASC",:limit => num_by_age) |
|---|
| | 307 | else |
|---|
| | 308 | urls += QueueUrl.find(:all, |
|---|
| | 309 | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time}, |
|---|
| | 310 | :order => "created_at ASC",:limit => num_by_age) |
|---|
| | 311 | end |
|---|
| | 312 | |
|---|
| | 313 | |
|---|
| | 314 | # Only perform additional lookups when we still have slots left and when the last query didn't |
|---|
| | 315 | # give us an unexpected empty set of URLs. |
|---|
| | 316 | if ((remaining_num_urls > 0) && (((num_by_age > 0) && (urls.length > 0)) || (num_by_age <= 0))) |
|---|
| | 317 | # Get a percentage of requested URLs by popularity. |
|---|
| | 318 | # Calculate the percentage needed. |
|---|
| | 319 | #num_by_popularity = ((Configuration.get(:name => "percentage_urls_by_popularity", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil |
|---|
| | 320 | # Make sure we have enough remaining slots to fill. |
|---|
| | 321 | #num_by_popularity = (num_by_popularity > remaining_num_urls) ? remaining_num_urls : num_by_popularity |
|---|
| | 322 | # XXX: Optimization: As the last criterion, we make sure we use up all our slots. |
|---|
| | 323 | num_by_popularity = remaining_num_urls |
|---|
| | 324 | # Update the number of slots we have left. |
|---|
| | 325 | remaining_num_urls = remaining_num_urls - num_by_popularity |
|---|
| | 326 | |
|---|
| | 327 | if (urls.length > 0) |
|---|
| | 328 | urls += QueueUrl.find(:all, |
|---|
| | 329 | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, |
|---|
| | 330 | :order => "count DESC",:limit => num_by_popularity) |
|---|
| | 331 | else |
|---|
| | 332 | urls += QueueUrl.find(:all, |
|---|
| | 333 | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time}, |
|---|
| | 334 | :order => "count DESC",:limit => num_by_popularity) |
|---|
| | 335 | end |
|---|
| | 336 | |
|---|
| | 337 | end |
|---|
| | 338 | end |
|---|
| | 339 | |
|---|
| 288 | | # Get 20% of requested URLs by Age |
|---|
| 289 | | #num_by_age = (num_urls > num_by_priority) ? (num_urls*0.4).ceil : 0 |
|---|
| 290 | | num_by_age = 4 |
|---|
| 291 | | urls += QueueUrl.find(:all, |
|---|
| 292 | | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, |
|---|
| 293 | | :order => "created_at ASC",:limit => num_by_age) |
|---|
| 294 | | |
|---|
| 295 | | # Get 20% of requested URLs by Popularity |
|---|
| 296 | | #num_by_popularity = (r = num_urls - num_by_priority - num_by_age) > 0 ? r : 0 |
|---|
| 297 | | num_by_popularity = 4 |
|---|
| 298 | | urls += QueueUrl.find(:all, |
|---|
| 299 | | :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, |
|---|
| 300 | | :order => "count DESC",:limit => num_by_popularity) |
|---|
| 301 | | |
|---|
| 302 | | # Lock urls to prevent duplication of work by setting host_id (!0) |
|---|
| | 341 | # Lock URLs to prevent duplication of work, by setting host_id (!=0). |
|---|