-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'sjfranklin/Add-HTTP-Method'
* sjfranklin/Add-HTTP-Method: taxdump should work with http method Added clarifying comments for some changes Explained the regex, which finds the relevant file name from the raw http response The main difference here is how it pulls the list of URLs, parses them, and the actual download is essentially the same. If user mentions http as an argument, load the http rake. Ruby is quite over my head so this is a very quick-and-dirty fix Load normal or http rakefile depending on option Signed-off-by: Anurag Priyam <anurag08priyam@gmail.com>
- Loading branch information
Showing
2 changed files
with
95 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,20 @@ | ||
#!/usr/bin/env ruby

require 'rake'

# Exit immediately on Ctrl-C instead of dumping a rake stack trace.
trap :INT do
  puts "Quitting ..."
  exit!
end

# Select the HTTP-based rakefile when the user passes "http" on the command
# line; otherwise use the default (FTP-based) rakefile. The two branches of
# the original differed only in the rakefile's basename, so the choice is
# factored into a single variable instead of duplicating the four-line
# import/init/load/run sequence.
rakefile = ARGV.include?("http") ? "http-ncbi-blast-dbs" : "ncbi-blast-dbs"

import "#{File.dirname(__FILE__)}/../lib/#{rakefile}.rake"

Rake.application.init rakefile
Rake.application.load_imports
Rake.application.top_level
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
require 'net/http'
require 'uri'

# BUG FIX: the message previously named "http-ncbi-dbs-dgs.rake", a file that
# does not exist; it should name this rakefile.
puts "using http-ncbi-blast-dbs.rake"

# Downloads the tarball at the given URL if a local copy does not exist, if
# the local copy is older than the one on the server, or if the local copy is
# corrupt (md5 mismatch), then extracts it.
#
# last_to_do identifies the final volume of the database (an Array from the
# multitask driver, or the String "nil" for taxdump): only that volume is
# extracted without --exclude flags, so files shared between volumes are
# unpacked exactly once.
def download(url, last_to_do)
  file = File.basename(url)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"
  sh "wget -Nc --no-verbose #{url}.md5"
  sh "wget -N --no-verbose #{url}.md5"

  # Verify the tarball against its md5. Re-download if corrupt; extract
  # otherwise.
  sh "md5sum -c #{file}.md5" do |matched, _|
    if !matched
      # BUG FIX: the retry previously called download(url) with a single
      # argument, which raised ArgumentError (the method takes two); pass
      # last_to_do through so a corrupt file is actually re-fetched.
      sh "rm #{file} #{file}.md5"
      download(url, last_to_do)
    elsif last_to_do.include?(file)
      # Only the last volume extracts everything — too many tar instances
      # unzipping the same shared files clutter the system. BUG FIX: the
      # original compared `file == last_to_do`, which is never true because
      # last_to_do is an Array; include? works for both the Array case and
      # the "nil" String passed by the taxdump task.
      sh "tar xfov #{file}"
    else
      # At least nr and nt tarballs contain identical .?al files (unsure of
      # others), so skip shared members in all but the last volume.
      sh "tar xfov #{file} --exclude='*.?al' --exclude='taxdb*'"
    end
  end
end
|
||
|
||
# Queries the NCBI BLAST database listing over HTTPS and returns a Hash
# mapping each database name (e.g. "nt") to the Array of "host/path" strings
# for its volume tarballs, in the order the server lists them.
def databases
  method = 'https://'
  host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
  uri = URI.parse(method + host + "/" + dir + "/")

  response = Net::HTTP.get_response(uri)

  # The directory listing is HTML; splitting the body on whitespace yields
  # tokens such as:
  #   href="tsa_nt.06.tar.gz.md5">tsa_nt.06.tar.gz</a>
  # The second capture group pulls out the bare file name (tarball or md5).
  # BUG FIX: the dots in "tar.gz" are now escaped so the regex matches a
  # literal extension rather than any two characters.
  array_of_files = []
  response.body.split.each do |token|
    name = token[%r{(^href=".*">)(.*tar\.gz|.*md5)(</a>)$}, 2]
    array_of_files << name unless name.nil?
  end

  # Prepend host and directory so the downstream wget URL path is complete
  # (File.join collapses the duplicate slashes), keep only the tarballs (the
  # md5s are fetched individually by download), and group the volumes by
  # database name — the part of the basename before the first dot.
  array_of_files.
    map { |file| File.join(host, "/#{dir}/", file) }.
    select { |file| file.match(/\.tar\.gz$/) }.
    group_by { |file| File.basename(file).split('.')[0] }
end
|
||
|
||
# Create a user-facing task for each database to drive the download of its
# volumes in parallel. `last` records the database's final volume so that
# download() knows which tarball to extract without --exclude flags.
databases.each do |name, files|
  last = { name => files.last }
  multitask(name => files.map { |file| task(file) { download(file, last.values.uniq) } })
end

# List the names of all databases that can be downloaded when executed
# without any arguments.
task :default do
  # BUG FIX: the original evaluated `databases` once for nothing and then a
  # second time for the listing, doubling the HTTP round trips (the method
  # does not memoize); call it once.
  puts databases.keys.push('taxdump').join(', ')
end

# The taxonomy dump is not listed alongside the BLAST volumes, so fetch it
# directly. "nil" (a String, not nil) marks that there is no designated
# last volume for this single-file download.
task :taxdump do
  download('https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz', "nil")
end

# Quick-and-dirty way to make rake treat the "http" command-line word as a
# no-op task rather than an unknown target.
task :http do
  puts "using http method"
end