Skip to content

Commit 756fb98

Browse files
committed
add IDHash and use it after a pHash match
1 parent a9127c3 commit 756fb98

22 files changed

+264
-86
lines changed

Gemfile

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ gem 'image_processing', '~> 1.13', require: %w[ ruby-vips image_processing ]
4848
ENV['PHASHION_USE_GITHUB_SRC'] ?
4949
gem('phashion', '~> 1.2', github: 'ryoga-chan/phashion') :
5050
gem('phashion', '~> 1.2', source: 'https://rubygems.pkg.github.com/ryoga-chan')
51+
gem 'dhash-vips', '~> 0.2' # dHash and IDHash for image similarity -- https://github.com/Nakilon/dhash-vips
5152

5253
#gem "solid_cache" # database-backed adapter for Rails.cache
5354
#gem "solid_queue" # database-backed adapter for Active Job

app/controllers/doujinshi_controller.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,8 @@ def search_cover
481481
end
482482

483483
if cover_hash
484-
CoverMatchingJob.perform_now cover_hash
485-
return redirect_to(hash: cover_hash, format: params[:format])
484+
CoverMatchingJob.perform_now cover_hash[:phash], cover_hash[:idhash]
485+
return redirect_to(hash: cover_hash[:idhash], format: params[:format])
486486
end
487487

488488
fname = File.join(Setting['dir.sorting'], 'cover-search.yml').to_s

app/controllers/home_controller.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,8 @@ def shared_content_receiver
9797
# single file: search by cover image
9898
when params[:images]&.one?
9999
if cover_hash = CoverMatchingJob.hash_image(params[:images].first.path)
100-
CoverMatchingJob.perform_now cover_hash
101-
redirect_to search_cover_doujinshi_path(hash: cover_hash)
100+
CoverMatchingJob.perform_now cover_hash[:phash], cover_hash[:idhash]
101+
redirect_to search_cover_doujinshi_path(hash: cover_hash[:idhash])
102102
else
103103
flash.now[:alert] = "fingerprinting image error"
104104
end

app/controllers/process_controller.rb

+3-3
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def delete_archive_cwd
232232

233233
ProcessArchiveDecompressJob.rm_entry folder: @dname
234234

235-
CoverMatchingJob.rm_results_file @info[:cover_hash]
235+
CoverMatchingJob.rm_results_file @info[:cover_hash][:idhash]
236236

237237
msg = params[:archive_too] == 'true' ? "archive and folder deleted:" : "folder deleted for"
238238
title = @info[:relative_path].one? ? @info[:relative_path].first : @info[:title]
@@ -424,15 +424,15 @@ def edit
424424
cover_path = ProcessArchiveDecompressJob.cover_path @dname, @info
425425
@info[:cover_hash] = CoverMatchingJob.hash_image cover_path
426426
File.open(File.join(@dname, 'info.yml'), 'w'){|f| f.puts @info.to_yaml }
427-
CoverMatchingJob.perform_now @info[:cover_hash]
427+
CoverMatchingJob.perform_now @info[:cover_hash][:phash], @info[:cover_hash][:idhash]
428428
end
429429

430430
@dupes, @dupes_deleted = [], []
431431

432432
# search dupes by cover similarity
433433
# check matching status/results
434434
if @info[:cover_hash].present? && !@info[:cover_results].is_a?(Hash)
435-
cover_matching = CoverMatchingJob.results @info[:cover_hash]
435+
cover_matching = CoverMatchingJob.results @info[:cover_hash][:idhash]
436436
if cover_matching.is_a?(Hash)
437437
@info[:cover_results ] = cover_matching[:results ]
438438
@info[:cover_results_deleted] = cover_matching[:results_deleted]

app/jobs/cover_matching_job.rb

+45-24
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ class CoverMatchingJob < ApplicationJob
33
# http://www.mikeperham.com/2010/05/21/detecting-duplicate-images-with-phashion/
44
# << Our testing showed that 15 bits is a good value to start with, it detected
55
# all duplicates with a minimum of false positives >>
6-
MAX_HAMMING_DISTANCE = 13 # = less than ~20% (13/64) different bits
6+
MAX_HAMMING_DISTANCE_PHASH = 13 # = less than ~20% (13/64) different bits
7+
MAX_HAMMING_DISTANCE_IDHASH = 25 # empirical threshold
78

89
queue_as :search
910

@@ -16,30 +17,26 @@ def self.hash_image_buffer(image_data, hash_only: false)
1617
width: ProcessArchiveCompressJob::THUMB_WIDTH,
1718
height: ProcessArchiveCompressJob::THUMB_HEIGHT
1819

19-
# calculate its pHash
20-
fname = File.join(Setting['dir.sorting'], "#{fkey}.webp").to_s
21-
File.open(fname, 'wb'){|f| f.write thumb[:image].webpsave_buffer }
22-
phash = Kernel.suppress_output{ '%016x' % Phashion::Image.new(fname).fingerprint }
23-
FileUtils.rm_f fname # remove temp image
20+
h = thumb[:image].fingerprints
2421

25-
return phash if hash_only
22+
return h if hash_only
2623

27-
{ phash: phash, landscape: thumb[:landscape],
28-
image: Base64.encode64(thumb[:image].webpsave_buffer).chomp }
24+
h.merge! landscape: thumb[:landscape],
25+
image: Base64.encode64(thumb[:image].webpsave_buffer).chomp
2926
end # self.hash_image_buffer
3027

3128
def self.hash_image(image_path)
3229
result = hash_image_buffer File.binread(image_path)
3330

3431
# create metadata + embedded image
35-
File.atomic_write(File.join(Setting['dir.sorting'], "#{result[:phash]}.yml").to_s) do |f|
32+
File.atomic_write(File.join(Setting['dir.sorting'], "#{result[:idhash]}.yml").to_s) do |f|
3633
f.puts(result.merge({
3734
status: :comparing,
3835
started_at: Time.now,
3936
}).to_yaml)
4037
end
4138

42-
result[:phash]
39+
result
4340
end # self.hash_image
4441

4542
# return final results and delete temp file after matching completed
@@ -58,14 +55,15 @@ def self.rm_results_file(image_hash)
5855
FileUtils.rm_f File.join(Setting['dir.sorting'], "#{image_hash}.yml").to_s
5956
end # self.rm_results_file
6057

61-
def self.find(model, phash, max_distance: MAX_HAMMING_DISTANCE, from_id: 0)
58+
def self.find(model, phash, idhash, from_id: 0)
6259
# https://stackoverflow.com/questions/2281580/is-there-any-way-to-convert-an-integer-3-in-decimal-form-to-its-binary-equival/2310694#2310694
6360
# https://stackoverflow.com/questions/49601249/string-to-binary-and-back-using-pure-sqlite
6461
# GENERATE TERMS: puts (0..63).map{|i| "(x>>#{i.to_s.rjust 2}&1)" }.each_slice(5).map{|s| s.join(' + ') }.join(" +\n")
6562
# NOTE: a XOR b = (a|b)-(a&b) = (~(a&b))&(a|b)
6663
query = <<~SQL
6764
SELECT
6865
id
66+
, cover_idhash
6967
, (x &1) + (x>> 1&1) + (x>> 2&1) + (x>> 3&1) + (x>> 4&1) +
7068
(x>> 5&1) + (x>> 6&1) + (x>> 7&1) + (x>> 8&1) + (x>> 9&1) +
7169
(x>>10&1) + (x>>11&1) + (x>>12&1) + (x>>13&1) + (x>>14&1) +
@@ -81,38 +79,61 @@ def self.find(model, phash, max_distance: MAX_HAMMING_DISTANCE, from_id: 0)
8179
(x>>60&1) + (x>>61&1) + (x>>62&1) + (x>>63&1) AS hamming_distance
8280
FROM (
8381
SELECT id
84-
, (~(a&b))&(a|b) AS x -- x = a XOR b
82+
, cover_idhash
83+
, (~(a&b))&(a|b) AS x -- x = a XOR b
8584
FROM (
8685
SELECT id
87-
, 0x#{phash} AS a
88-
, cover_phash AS b
86+
, #{phash.to_i} AS a
87+
, cover_phash AS b
88+
, cover_idhash
8989
FROM #{model.table_name}
90-
WHERE cover_phash IS NOT NULL
90+
WHERE cover_phash IS NOT NULL
91+
AND cover_idhash IS NOT NULL
9192
AND id > #{from_id.to_i}
9293
)
9394
)
94-
WHERE hamming_distance < #{max_distance.to_i}
95+
WHERE hamming_distance < #{MAX_HAMMING_DISTANCE_PHASH}
9596
ORDER BY hamming_distance
9697
LIMIT 10
9798
SQL
9899

99-
model.
100-
find_by_sql(query).
101-
inject({}){|h, d| h.merge d.id => ((1 - d.hamming_distance.to_f / 64) * 100).round }
100+
idhash = idhash.to_i(16) if idhash.is_a?(String)
101+
102+
model.find_by_sql(query).inject({}) do |h, d|
103+
# compute the more accurate IDHash distance
104+
DHashVips::IDHash.distance3(d.cover_idhash.to_i(16), idhash) < MAX_HAMMING_DISTANCE_IDHASH \
105+
? h.merge!(d.id => ((1 - d.hamming_distance.to_f / 64) * 100).round)
106+
: h
107+
end
102108
end # self.find
103109

110+
# phash* is a signed integer, idhash is a hexadecimal string
111+
# returns nil or the similarity percentage of the matching
112+
def self.similarity(phash1, phash2, idhash1, idhash2)
113+
d1 = Phashion.hamming_distance [phash1].pack('q').unpack('Q').first,
114+
[phash2].pack('q').unpack('Q').first
115+
116+
if d1 < MAX_HAMMING_DISTANCE_PHASH
117+
# compute the more accurate IDHash distance
118+
d2 = DHashVips::IDHash.distance3 idhash1.to_i(16), idhash2.to_i(16)
119+
return ((1 - d1.to_f / 64) * 100).round if d2 < MAX_HAMMING_DISTANCE_IDHASH
120+
end
121+
122+
nil
123+
end # self.similarity
124+
104125
# read image data from temp file and do a matching against all saved doujinshi
105126
# by computing hamming distance between pHashes
106-
def perform(image_hash, max_distance: MAX_HAMMING_DISTANCE)
107-
fname = File.join(Setting['dir.sorting'], "#{image_hash}.yml").to_s
127+
def perform(phash, idhash)
128+
fname = File.join(Setting['dir.sorting'], "#{idhash}.yml").to_s
108129
info = YAML.unsafe_load_file fname
109130

110131
# write results
111132
info.merge! \
112133
status: :completed,
113134
finished_at: Time.now,
114-
results: CoverMatchingJob.find(Doujin , image_hash, max_distance: max_distance),
115-
results_deleted: CoverMatchingJob.find(DeletedDoujin, image_hash, max_distance: max_distance)
135+
results: CoverMatchingJob.find(Doujin , phash, idhash),
136+
results_deleted: CoverMatchingJob.find(DeletedDoujin, phash, idhash)
116137

117138
File.atomic_write(fname){|f| f.puts info.to_yaml }
118139
end # perform

app/jobs/process_archive_compress_job.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def perform(src_dir)
167167
File.atomic_write(File.join(src_dir, 'finalize.perc')){|f| f.write perc.round(2) }
168168
end # Doujin.transaction
169169

170-
CoverMatchingJob.rm_results_file info[:cover_hash]
170+
CoverMatchingJob.rm_results_file info[:cover_hash][:idhash]
171171
rescue
172172
info[:finalize_error ] = $!.to_s
173173
info[:finalize_backtrace] = $!.backtrace

app/jobs/process_batch_inspect_job.rb

+6-4
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@ def perform(hash)
2121
info[:files][name] = image_entries.map &:name
2222

2323
if cover = image_entries.first # extract cover image
24-
info[:thumbs][name] = CoverMatchingJob.
25-
hash_image_buffer cover.get_input_stream.read
24+
info[:thumbs][name] = CoverMatchingJob.hash_image_buffer cover.get_input_stream.read
2625
end
2726
end
2827

@@ -35,8 +34,11 @@ def perform(hash)
3534
matching_found = false
3635

3736
groups.keys.each do |g|
38-
hd = Phashion.hamming_distance info[:thumbs][f][:phash].to_i(16), info[:thumbs][g][:phash].to_i(16)
39-
if hd < CoverMatchingJob::MAX_HAMMING_DISTANCE
37+
match_found = CoverMatchingJob.similarity \
38+
info[:thumbs][f][:phash ], info[:thumbs][g][:phash ],
39+
info[:thumbs][f][:idhash], info[:thumbs][g][:idhash]
40+
41+
if match_found
4042
groups[g] << f
4143
matching_found = true
4244
break

app/jobs/process_batch_job.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,8 @@ def add_error(results, key, msg, opts)
126126
# run cover image hash matching
127127
cover_path = ProcessArchiveDecompressJob.cover_path dname, info
128128
info[:cover_hash] = CoverMatchingJob.hash_image cover_path
129-
CoverMatchingJob.perform_now info[:cover_hash]
130-
cover_matching = CoverMatchingJob.results info[:cover_hash]
129+
CoverMatchingJob.perform_now info[:cover_hash][:phash], info[:cover_hash][:idhash]
130+
cover_matching = CoverMatchingJob.results info[:cover_hash][:idhash]
131131
info[:cover_results] = cover_matching[:results]
132132
info[:cover_status ] = cover_matching[:status]
133133
File.open(info_fname, 'w'){|f| f.puts info.to_yaml }

app/jobs/process_index_group_job.rb

+3-3
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,21 @@ def perform(*args)
2424
rel = ProcessableDoujin.where.not(cover_phash: nil).where("id >= ?", last_id.to_i)
2525
num_entries = rel.count
2626
print_info_freq = [num_entries / 100 + 1, 10].max
27-
rel.select("id, PRINTF('%016x', cover_phash) AS cover_phash_hex").find_each.with_index do |pd, i|
27+
rel.select("id, cover_phash, cover_idhash").find_each.with_index do |pd, i|
2828
break if Rails.env.development? && i >= DEVEL_LIMIT
2929

3030
ris = self.class.progress_update(step: (i+1), steps: num_entries, msg: 'comparing covers')
3131
Rails.logger.info(ris) if i % print_info_freq == 0
3232

3333
ProcessableDoujin.transaction do
34-
CoverMatchingJob.find(ProcessableDoujin, pd.cover_phash_hex, from_id: (last_id ? 0 : pd.id)).each do |id, perc|
34+
CoverMatchingJob.find(ProcessableDoujin, pd.cover_phash, pd.cover_idhash, from_id: (last_id ? 0 : pd.id)).each do |id, perc|
3535
next if pd.id == id
3636
ids = [pd.id, id].sort
3737
pdd = ProcessableDoujinDupe.find_or_initialize_by(pd_parent_id: ids[0], pd_child_id: ids[1])
3838
pdd.update likeness: perc
3939
end # each dupe
4040

41-
CoverMatchingJob.find(Doujin, pd.cover_phash_hex).each do |id, perc|
41+
CoverMatchingJob.find(Doujin, pd.cover_phash, pd.cover_idhash).each do |id, perc|
4242
pdd = ProcessableDoujinDupe.find_or_initialize_by(pd_parent_id: pd.id, doujin_id: id)
4343
pdd.update likeness: perc
4444
end # each dupe

app/jobs/process_index_refresh_job.rb

+15-20
Original file line numberDiff line numberDiff line change
@@ -74,35 +74,30 @@ def self.rm_entry(path_or_id, track: false, rm_zip: false, merged: false, doujin
7474

7575
if File.exist?(zip_path)
7676
if track
77-
cover_hash = nil
77+
dd = DeletedDoujin.new
7878

7979
# count images and other files
80-
file_counters = { num_images: 0, num_files: 0 }
8180
Zip::File.open(zip_path) do |zip|
82-
zip.entries.sort_by_method(:name).each do |e|
83-
next unless e.file?
81+
entries = zip.split_entries(sort: true)
8482

85-
is_image = e.name.is_image_filename?
83+
dd.num_images = entries[:images].size
84+
dd.num_files = entries[:files ].size
8685

87-
# generate phash for the first image file
88-
if cover_hash.nil? && is_image
89-
cover_hash = CoverMatchingJob.hash_image_buffer(e.get_input_stream.read)[:phash]
90-
end
91-
92-
file_counters[is_image ? :num_images : :num_files] += 1
86+
# generate phash for the first image file
87+
if cover = entries[:images].first
88+
h = CoverMatchingJob.hash_image_buffer(cover.get_input_stream.read, hash_only: true)
89+
dd.cover_phash = h[:phash]
90+
dd.cover_idhash = h[:idhash]
9391
end
9492
end
9593

9694
# track deletion
97-
name = pd.name.tr(File::SEPARATOR, ' ')
98-
dd = DeletedDoujin.create! file_counters.merge({
99-
name: name,
100-
name_kakasi: name.to_romaji,
101-
size: File.size(zip_path),
102-
merged: merged,
103-
doujin_id: doujin_id,
104-
})
105-
dd.cover_fingerprint! cover_hash if cover_hash.present?
95+
name = pd.name.tr File::SEPARATOR, ' '
96+
dd.update! name: name,
97+
name_kakasi: name.to_romaji,
98+
size: File.size(zip_path),
99+
merged: merged,
100+
doujin_id: doujin_id
106101
end # if track
107102

108103
# remove file from disk

app/models/deleted_doujin.rb

+2-3
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@ def self.search(terms)
3636
rel.order(Arel.sql "COALESCE(NULLIF(alt_name_kakasi, ''), NULLIF(name_kakasi, ''))")
3737
end # self.search
3838

39-
def cover_fingerprint!(fp)
39+
def cover_fingerprint!(h)
4040
raise :record_not_persisted unless persisted?
41-
self.class.connection.execute \
42-
%Q(UPDATE #{self.class.table_name} SET cover_phash = 0x#{fp} WHERE id = #{id})
41+
update! cover_phash: h[:phash], cover_idhash: h[:idhash]
4342
end # cover_fingerprint!
4443
end

app/models/doujin.rb

+3-5
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,8 @@ def cover_fingerprint = CoverMatchingJob.hash_image_buffer(File.binread(thumb_di
231231

232232
def cover_fingerprint!
233233
raise :record_not_persisted unless persisted?
234-
fp = cover_fingerprint
235-
self.class.connection.execute \
236-
%Q(UPDATE #{self.class.table_name} SET cover_phash = 0x#{fp} WHERE id = #{id})
237-
fp
234+
h = cover_fingerprint
235+
update! cover_phash: h[:phash], cover_idhash: h[:idhash]
238236
end # cover_fingerprint!
239237

240238
# next page number for `/doujinshi/ID/read` action
@@ -308,7 +306,7 @@ def save_deletion_data
308306
return unless @save_deletion_data
309307

310308
fname = file_dl_name omit_ext: true
311-
attrs = attributes.slice *%w[ size num_images num_files cover_phash ]
309+
attrs = attributes.slice *%w[ size num_images num_files cover_phash cover_idhash ]
312310

313311
DeletedDoujin.create attrs.merge({
314312
doujin_id: id,

app/models/processable_doujin.rb

+7-10
Original file line numberDiff line numberDiff line change
@@ -76,27 +76,24 @@ def cover_fingerprint
7676
fname = file_path full: true
7777
return unless File.exist?(fname)
7878

79-
phash = nil
79+
h = {}
8080

8181
Zip::File.open(fname) do |zip|
8282
zip_images = zip.image_entries(sort: true)
83+
@images = zip_images.size
8384

84-
if zip_images.any?
85-
phash = CoverMatchingJob.hash_image_buffer zip_images.first.get_input_stream.read, hash_only: true
86-
@images = zip_images.size
85+
if cover = zip_images.first
86+
h = CoverMatchingJob.hash_image_buffer cover.get_input_stream.read, hash_only: true
8787
end
8888
end
8989

90-
phash
90+
h
9191
end # cover_fingerprint
9292

9393
def cover_fingerprint!
9494
raise :record_not_persisted unless persisted?
95-
fp = cover_fingerprint
96-
self.class.connection.execute \
97-
%Q(UPDATE #{self.class.table_name} SET cover_phash = 0x#{fp} WHERE id = #{id})
98-
update images: @images
99-
fp
95+
h = cover_fingerprint
96+
update! cover_phash: h[:phash], cover_idhash: h[:idhash], images: @images
10097
end # cover_fingerprint!
10198

10299
def generate_preview

config/initializers/logging.eval

+3-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ if ENV['PUMA_DAEMON'] && Rails.env.production?
77
logger_args = [File.join(Dir.tmpdir, "ruby:djmngr-server.log"), 2]
88
end
99

10-
unless Rails.const_defined?(:Console) || defined?(Rake)
10+
unless Rails.const_defined?(:Console) || # bin/rails console
11+
Rails.const_defined?(:Generators) || # bin/rails generate
12+
defined?(Rake) # bin/rails x:y:z
1113
puts "* Logging to #{logger_args.first.inspect}"
1214
end
1315

0 commit comments

Comments
 (0)