Commit

new file: Julia/stress_load.jl
jvo203 committed Dec 1, 2023
1 parent 7327faf commit e2f7010
Showing 2 changed files with 176 additions and 176 deletions.
176 changes: 176 additions & 0 deletions Julia/stress_load.jl
@@ -0,0 +1,176 @@
using HTTP
using JSON
using LibPQ, Tables
using ProgressMeter

function connect_db(db_name)
    user = String(UInt8.([106])) * String(UInt8.([118])) * String(UInt8.([111]))
    password = user * String(UInt8.([33]))
    # host = "jvof" # on zodiac
    host = "jvox.vo.nao.ac.jp" # on the cluster

    url = "postgresql://" * user

    if password != ""
        url *= ":" * password
    end

    url *= "@" * host
    url *= "/" * db_name

    return LibPQ.Connection(url)
end
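# NOTE: connect_db above assembles the username and password from ASCII byte
# codes so that the credentials never appear as plain-text literals in the file.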

function get_datasets(conn, threshold)
    # threshold is given in GB

    # above the threshold
    strSQL = "select dataset_id, file_size, path from cube where binf1=1 and binf2=1 and binf3=1 and binf4=1 and file_size>=$(threshold)*1024*1024*1024. order by file_size desc;"

    # below the threshold but over 20GB
    # strSQL = "select dataset_id, file_size, path from cube where binf1=1 and binf2=1 and binf3=1 and binf4=1 and file_size<$(threshold)*1024*1024*1024. and file_size>=20*1024*1024*1024. order by file_size desc;"

    res = execute(conn, strSQL)
    data = columntable(res)

    return data
end
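# NOTE: the trailing "." in 1024*1024*1024. forces numeric (floating-point)
# arithmetic on the PostgreSQL side, so the byte-size product cannot overflow
# a 32-bit integer; binf1..binf4 = 1 is assumed to select unbinned cubes
# (an assumption about the schema).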

function get_dataset_url(datasetid)
    return "http://grid60:8080/fitswebql/FITSWebQL.html?db=alma&table=cube&datasetId=" * datasetid
end

function copy_dataset(datasetid, file_size, path)
    src = "/home/alma/" * path
    dst = "/mnt/fits/files/" * datasetid * ".fits"

    # check if the src file exists
    if !isfile(src)
        println("The source file $(src) does not exist. Skipping.")
        return false
    end

    # get the src filesize
    src_filesize = filesize(src)

    if src_filesize != file_size
        println("The source file $(src) has a different size than the database. Skipping.")
        return false
    end

    println("Copying dataset $(datasetid) with size $(round(file_size / 1024^3, digits=1)) GB from $(src) to $(dst)")

    # check if the dst file already exists
    if isfile(dst)
        # first check the file size
        dst_filesize = filesize(dst)

        if dst_filesize == src_filesize
            println("The destination file $(dst) already exists. Skipping.")
            return true
        end
    end

    # make a 256KB chunk
    chunk = 256 * 1024

    # minimum update interval: 1 second; Progress expects an Integer total,
    # hence the Int() conversion of the database file size
    p = Progress(Int(file_size); dt=1, desc="Copying...")

    # copy the source file in chunks
    open(src, "r") do src_file
        open(dst, "w") do dst_file
            while !eof(src_file)
                write(dst_file, read(src_file, chunk))
                update!(p, position(src_file))
            end
        end
    end

    return true
end
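# NOTE: read(io, n) returns at most n bytes, so the final short chunk in the
# copy loop above is handled naturally; 256 KiB is a reasonable trade-off
# between syscall overhead and memory use.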

function poll_progress(datasetid)
    strURL = "http://grid60:8080/fitswebql/progress/" * datasetid

    # status_exception=false: do not throw on a non-2xx response,
    # so the status code can be inspected below
    resp = HTTP.get(strURL; status_exception=false)
    # println(resp)

    if resp.status == 200
        return JSON.parse(String(resp.body))["progress"]
    else
        return nothing
    end
end
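# NOTE: the progress endpoint is assumed to return JSON of the form
# {"progress": 42.5}, i.e. a percentage between 0 and 100.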

function preload_dataset(datasetid)
    local progress, strURL

    strURL = get_dataset_url(datasetid)

    # access the FITSWEBQLSE server (status_exception=false so that a
    # non-2xx response is reported below instead of throwing)
    resp = HTTP.get(strURL; status_exception=false)

    # check the HTTP response code
    if resp.status != 200
        println(resp)
        return
    end

    # repeatedly poll for progress
    while true
        progress = poll_progress(datasetid)

        if isnothing(progress)
            println("\nno progress")
            break
        end

        println("datasetid: ", datasetid, ", progress: ", Int(floor(progress)), "%")

        # progress above 100% should never happen; throw a DomainError to
        # surface logical bugs, network problems, etc. early
        if progress > 100
            println("\nanomalous progress detected: $(progress)!")
            throw(DomainError(progress, "anomalous progress detected"))
        end

        if progress == 100
            break
        else
            sleep(1)
        end
    end

    # optionally wait out the 60 s dataset timeout before returning
    # (avoids piling preloaded datasets up in RAM)
    # sleep(61)
end
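
# --- driver: copy the largest unbinned datasets to the local cache, then
# preload them all concurrently in FITSWEBQLSE ---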

conn = connect_db("alma")

threshold = 21 # GB

datasets = get_datasets(conn, threshold)

# preload at most the 5 largest datasets (guard against fewer matching rows)
count = min(5, length(datasets[:dataset_id]))
ids = datasets[:dataset_id][1:count]
sizes = datasets[:file_size][1:count]
paths = datasets[:path][1:count]

index = 1
total_count = length(ids) # number of datasets to preload

for (datasetid, file_size, path) in zip(ids, sizes, paths)
    global index

    println("#$index/$total_count :: $datasetid :: $(round(file_size / 1024^3, digits=1)) GB")
    copy_dataset(datasetid, file_size, path)

    # increment the index
    index += 1
end
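
# NOTE: @async spawns cooperative tasks on the scheduler's current thread; the
# HTTP calls inside preload_dataset yield at I/O, so the preloads proceed
# concurrently even without Threads.@spawn.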

jobs = [@async preload_dataset(id) for id in ids]
wait.(jobs)

close(conn)
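
# usage sketch (assumes Julia >= 1.6 with HTTP, JSON, LibPQ, Tables and
# ProgressMeter installed, plus network access to the jvox database host
# and the grid60 FITSWEBQLSE server):
#   julia Julia/stress_load.jl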
176 changes: 0 additions & 176 deletions Julia/stress_test.jl
@@ -1,176 +0,0 @@
(the 176 removed lines are identical to the 176 lines added in Julia/stress_load.jl above; the commit renames stress_test.jl to stress_load.jl)
