jam-cloud/ruby/lib/jam_ruby/models/max_mind_isp.rb

124 lines
4.0 KiB
Ruby

require 'csv'
module JamRuby
class MaxMindIsp < ActiveRecord::Base
self.table_name = 'max_mind_isp'
@@log = Logging.logger[MaxMindIsp]
def self.import_from_max_mind(options)
file = options[:file]
use_copy = options[:use_copy]
# File Geo-142
# Format:
# "beginIp","endIp","countryCode","ISP"
# drop indexes on start, then add them back when done
start = Time.now
MaxMindIsp.delete_all
if use_copy
Database.copy(MaxMind.table_name, file)
else
File.open(file, 'r:ISO-8859-1') do |io|
s = io.gets.strip # eat the copyright line. gah, why do they have that in their file??
unless s.eql? 'Copyright (c) 2011 MaxMind Inc. All Rights Reserved.'
puts s
puts 'Copyright (c) 2011 MaxMind Inc. All Rights Reserved.'
raise 'file does not start with expected copyright (line 1): Copyright (c) 2011 MaxMind Inc. All Rights Reserved.'
end
s = io.gets.strip # eat the headers line
unless s.eql? '"beginIp","endIp","countryCode","ISP"'
puts s
puts '"beginIp","endIp","countryCode","ISP"'
raise 'file does not start with expected header (line 2): "beginIp","endIp","countryCode","ISP"'
end
saved_level = ActiveRecord::Base.logger ? ActiveRecord::Base.logger.level : 0
count = 0
stmt = "insert into #{MaxMindIsp.table_name} (ip_bottom, ip_top, country, isp) values"
vals = ''
sep = ''
i = 0
n = 20 # going from 20 to 40 only changed things a little bit, and 512 was slower... and 1024 was even slower (weird)
csv = ::CSV.new(io, encoding: 'ISO-8859-1', headers: false)
csv.each do |row|
raise "file does not have expected number of columns (4): #{row.length}" unless row.length == 4
ip_bottom = ip_address_to_int(strip_quotes(row[0]))
ip_top = ip_address_to_int(strip_quotes(row[1]))
country = row[2]
isp = row[3]
vals = vals+sep+"(#{ip_bottom}, #{ip_top}, '#{country}', #{quote_value(isp, nil)})"
sep = ','
i += 1
if count == 0 or i >= n
MaxMindIsp.connection.execute stmt+vals
count += i
vals = ''
sep = ''
i = 0
if ActiveRecord::Base.logger and ActiveRecord::Base.logger.level > 1
ActiveRecord::Base.logger.debug "... logging inserts into #{MaxMindIsp.table_name} suspended ..."
ActiveRecord::Base.logger.level = 1
end
if ActiveRecord::Base.logger and count%10000 < n
ActiveRecord::Base.logger.level = saved_level
ActiveRecord::Base.logger.debug "... inserted #{count} into #{MaxMindIsp.table_name} ..."
ActiveRecord::Base.logger.level = 1
end
end
end
if i > 0
MaxMindIsp.connection.execute stmt+vals
count += i
end
if ActiveRecord::Base.logger
ActiveRecord::Base.logger.level = saved_level
ActiveRecord::Base.logger.debug "loaded #{count} records into #{MaxMindIsp.table_name}"
end
end
end
elapsed = Time.now - start
@@log.debug("#{MaxMindIsp.table_name} import took #{elapsed} seconds")
end
# Make an IP address fit in a signed int. Just divide it by 2, as the least significant part
# just can't possibly matter. We can verify this if needed. My guess is the entire bottom octet is
# actually irrelevant
def self.ip_address_to_int(ip)
ip.split('.').inject(0) { |total, value| (total << 8) + value.to_i }
end
private
def self.strip_quotes str
return nil if str.nil?
if str.start_with? '"'
str = str[1..-1]
end
if str.end_with? '"'
str = str.chop
end
return str
end
end
end