Changeset 1574

Show
Ignore:
Timestamp: 05/07/08 18:47:13 (5 days ago)
Author: kindlund
Message: Part of Ticket #165 - Externalized hardcoded variables into the 'configurations' table in the database.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • hive/trunk/data_webapp/app/controllers/hc_database_controller.rb

    r1561 r1574  
    7979    client = Client.find(cid) 
    8080    hid = client.host_id 
    81      
    82     #XXX: Get source information 
    83     #stype = obj_hash.delete("source_type") or "unknown" 
    84     #sname = obj_hash.delete("source_name") 
    8581 
    8682    bee_work = {"cid" => client.cid, "urls" => []} 
     
    268264  # Retrieve num_urls links for client with id client_id 
    269265  def get_new_queue_urls(hostname,num_urls) 
    270     # Get requesting Host object 
     266    # Get requesting Host object. 
    271267    host = Host.find(:first, :conditions => {:hostname => hostname}) 
    272268 
    273     # Get any urls currently assigned to this host 
    274     # Calculate the time which urls must be older than 
    275     #XXX: Hardcoded url num values and age, but will eventually be configuration variables 
    276     min_age_by_min = 5 
    277     older_than_time = Time.now.to_f - min_age_by_min*60 
    278  
    279     # Get 60% of requested URLs by Highest Priority 
    280     #num_by_priority = (num_urls*0.6).ceil 
    281     num_by_priority = 12 
     269    # Get any URLs currently assigned to this host. 
     270    # Calculate the time which URLs must be older than. 
     271    older_than_time = Time.now.to_f - (Configuration.get(:name => "revisit_delay", :namespace => "QueueUrl").to_i * 60)  
     272 
     273    # Get the minimum number of URLs that should be assigned. 
     274    min_num_urls = Configuration.get(:name => "min_num_urls", :namespace => "QueueUrl").to_i 
     275     
     276    # Keep track of the remaining number of URLs to assign. 
     277    remaining_num_urls = min_num_urls 
     278 
     279    # Get a percentage of requested URLs by highest priority. 
     280    # Calculate the percentage needed. 
     281    num_by_priority = ((Configuration.get(:name => "percentage_urls_by_priority", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil 
     282    # Make sure we have enough remaining slots to fill. 
     283    num_by_priority = (num_by_priority > remaining_num_urls) ? remaining_num_urls : num_by_priority 
     284    # Update the number of slots we have left. 
     285    remaining_num_urls = remaining_num_urls - num_by_priority 
     286 
     287    # Get the URLs. 
    282288    urls = QueueUrl.find(:all, 
    283289      :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time}, 
    284290      :order => "priority DESC",:limit => num_by_priority) 
    285291 
    286     # If no urls are retrieved in first query, avoid further queries 
     292    # Only perform additional lookups, when we still have slots left and when the last query didn't 
     293    # give us an unexpected empty set of URLs. 
     294    if ((remaining_num_urls > 0) && (((num_by_priority > 0) && (urls.length > 0)) || (num_by_priority <= 0))) 
     295      # Get a percentage of requested URLs by age. 
     296      # Calculate the percentage needed. 
     297      num_by_age = ((Configuration.get(:name => "percentage_urls_by_age", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil 
     298      # Make sure we have enough remaining slots to fill. 
     299      num_by_age = (num_by_age > remaining_num_urls) ? remaining_num_urls : num_by_age 
     300      # Update the number of slots we have left. 
     301      remaining_num_urls = remaining_num_urls - num_by_age 
     302 
     303      if (urls.length > 0) 
     304        urls += QueueUrl.find(:all, 
     305          :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, 
     306          :order => "created_at ASC",:limit => num_by_age) 
     307      else 
     308        urls += QueueUrl.find(:all, 
     309          :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time}, 
     310          :order => "created_at ASC",:limit => num_by_age) 
     311      end 
     312 
     313 
     314      # Only perform additional lookups, when we still have slots left and when the last query didn't 
     315      # give us an unexpected empty set of URLs. 
     316      if ((remaining_num_urls > 0) && (((num_by_age > 0) && (urls.length > 0)) || (num_by_age <= 0))) 
     317        # Get a percentage of requested URLs by popularity. 
     318        # Calculate the percentage needed. 
     319        #num_by_popularity = ((Configuration.get(:name => "percentage_urls_by_popularity", :namespace => "QueueUrl").to_f / 100) * min_num_urls).ceil 
     320        # Make sure we have enough remaining slots to fill. 
     321        #num_by_popularity = (num_by_popularity > remaining_num_urls) ? remaining_num_urls : num_by_popularity 
     322        # XXX: Optimization: As the last criteria, we make sure we use up all our slots. 
     323        num_by_popularity = remaining_num_urls 
     324        # Update the number of slots we have left. 
     325        remaining_num_urls = remaining_num_urls - num_by_popularity 
     326 
     327        if (urls.length > 0) 
     328          urls += QueueUrl.find(:all, 
     329            :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, 
     330            :order => "count DESC",:limit => num_by_popularity) 
     331        else 
     332          urls += QueueUrl.find(:all, 
     333            :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time}, 
     334            :order => "count DESC",:limit => num_by_popularity) 
     335        end 
     336 
     337      end 
     338    end 
     339 
    287340    if urls.length > 0 
    288       # Get 20% of requested URLs by Age 
    289       #num_by_age = (num_urls > num_by_priority) ? (num_urls*0.4).ceil : 0 
    290       num_by_age = 4 
    291       urls += QueueUrl.find(:all, 
    292         :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, 
    293         :order => "created_at ASC",:limit => num_by_age) 
    294  
    295       # Get 20% of requested URLs by Popularity 
    296       #num_by_popularity = (r = num_urls - num_by_priority - num_by_age) > 0 ? r : 0 
    297       num_by_popularity = 4 
    298       urls += QueueUrl.find(:all, 
    299         :conditions => {:host_id => 0,:last_visited_at_lt => older_than_time,:id_not_in => urls.map(&:id)}, 
    300         :order => "count DESC",:limit => num_by_popularity) 
    301  
    302       # Lock urls to prevent duplication of work by setting host_id (!0) 
     341      # Lock URLs to prevent duplication of work, by setting host_id (!=0). 
    303342      QueueUrl.update_all('host_id='+host.id.to_s,'id IN ('+urls.map(&:id).join(',')+')') 
    304343    end 
    305     # Return the urls as a hash table url/priority pairs (e.g. {"http://www.honeyclient.org" => 1}) 
     344 
     345    # Return the URLs as a hash table of url/priority pairs (e.g. {"http://www.honeyclient.org" => 1}) 
    306346    url_hash = Hash[*urls.collect {|u| [u.url,u.priority]}.flatten] 
    307347    RbYAML.dump(url_hash) 
  • hive/trunk/data_webapp/app/models/configuration.rb

    r1572 r1574  
    11class Configuration < ActiveRecord::Base 
     2 
     3  # Get accepts two required arguments: 
     4  #  
     5  # * <tt>:name</tt>: The name of the variable to look for. 
     6  # * <tt>:namespace</tt>: The namespace to search within. 
     7  # 
     8  # If any configuration entry contains a matching name and 
     9  # namespace, then the corresponding value is returned. 
     10  # 
     11  # Otherwise, nil will be returned. 
     12  def self.get(args = {}) 
     13    obj = Configuration.find(:first, :conditions => args) 
     14    return obj.nil? ? nil : obj.value 
     15  end 
     16 
    217end 
  • hive/trunk/data_webapp/db/migrate/025_create_configurations.rb

    r1573 r1574  
    55      t.column :name, :string 
    66      t.column :value, :string 
     7      t.column :namespace, :string 
     8      t.column :description, :string 
     9      t.column :default_value, :string 
    710    end 
     11    add_index :configurations, :name 
     12    add_index :configurations, :namespace 
    813  end 
    914 
    1015  def self.down 
     16    remove_index :configurations, :name 
     17    remove_index :configurations, :namespace 
    1118    drop_table :configurations 
    1219  end