Scraping Amazon S3 files with Ruby
Below is a fairly simple Ruby script that parses the files uploaded to an Amazon S3 bucket and inserts each file's information into a MySQL database.
#!/usr/local/bin/ruby

require 'rubygems'
require 'aws/s3'
require 'mysql'
require 'lockfile'

# Lists the objects under one S3 prefix and inserts a row into the MySQL
# `files` table for every object whose "name - modified" pair is not already
# stored there.
#
# The whole run is wrapped in a lock file created with :retries => 0; when the
# lock cannot be acquired, Lockfile::MaxTriesLockError is rescued at the bottom
# and a message is printed instead of scraping.
begin
  Lockfile.new('/tmp/scraper.lock', :retries => 0) do # Setup the lock file
    my = Mysql.new("localhost", "nfssupport", "password", "upload_files") # Setup the MySQL connection

    begin
      # Build the list of "name - modified" keys already stored in the DB.
      q = my.query("Select FileName, FileModified from files")
      db = []
      q.each_hash do |f|
        db << "#{f['FileName']} - #{f['FileModified']}"
      end

      AWS::S3::Base.establish_connection!( # Create the S3 connection
        :access_key_id => 'access_key_id',
        :secret_access_key => 'secret_access_key'
      )
      files = AWS::S3::Bucket.objects( # Put the S3 files into the files array
        'domain-tld',
        :prefix => 'uploads/user@domain.tld/uuid'
      )

      # Prepare the INSERT once and reuse it for every new file, rather than
      # re-preparing (and re-parsing) it on each loop iteration.
      st = my.prepare("insert into files (FileName, FileDate, FileModified, FileLink, FileDescription, FileSize) VALUES (?, ?, ?, ?, ?, ?)")

      begin
        files.each do |file| # For each S3 file
          about = file.about
          last_modified = about['last-modified']

          # FileDate is the last-modified date rendered as YYYYMMDD.
          file_date = Time.parse(last_modified).strftime('%Y%m%d')
          # Keep the 4th '/'-separated part of the key onward and strip a
          # leading "<digits>-" prefix from it.
          file_name = file.key.split('/', 4).last.gsub(/^(\d+\-)/, '')
          # Everything after the 4th space-separated token of the header value.
          file_modified = last_modified.split(' ', 5).last

          next if db.include?("#{file_name} - #{file_modified}") # Skip files already in the DB

          st.execute(
            file_name,
            file_date,
            file_modified,
            "https://uploads.domain.com/download/#{file.key.split('/', 3).last}",
            file.metadata['x-amz-meta-description'],
            about['content-length']
          ) # Insert it
        end
      ensure
        st.close # Release the prepared statement even if an insert fails
      end
    ensure
      my.close # Always release the MySQL connection
    end
  end
rescue Lockfile::MaxTriesLockError
  puts "Scraper is already running!"
end