@@ -3,7 +3,8 @@ class CoverMatchingJob < ApplicationJob
3
3
# http://www.mikeperham.com/2010/05/21/detecting-duplicate-images-with-phashion/
# << Our testing showed that 15 bits is a good value to start with, it detected
# all duplicates with a minimum of false positives >>
#
# NOTE: both thresholds are EXCLUSIVE upper bounds (distances are compared
# with `<`), and the pHash value used here is stricter than the 15 bits
# quoted above.
MAX_HAMMING_DISTANCE_PHASH = 13 # = less than ~20% (13/64) different bits
MAX_HAMMING_DISTANCE_IDHASH = 25 # empirical threshold
7
8
8
9
queue_as :search
9
10
@@ -16,30 +17,26 @@ def self.hash_image_buffer(image_data, hash_only: false)
16
17
width : ProcessArchiveCompressJob ::THUMB_WIDTH ,
17
18
height : ProcessArchiveCompressJob ::THUMB_HEIGHT
18
19
19
- # calculate its pHash
20
- fname = File . join ( Setting [ 'dir.sorting' ] , "#{ fkey } .webp" ) . to_s
21
- File . open ( fname , 'wb' ) { |f | f . write thumb [ :image ] . webpsave_buffer }
22
- phash = Kernel . suppress_output { '%016x' % Phashion ::Image . new ( fname ) . fingerprint }
23
- FileUtils . rm_f fname # remove temp image
20
+ h = thumb [ :image ] . fingerprints
24
21
25
- return phash if hash_only
22
+ return h if hash_only
26
23
27
- { phash : phash , landscape : thumb [ :landscape ] ,
28
- image : Base64 . encode64 ( thumb [ :image ] . webpsave_buffer ) . chomp }
24
+ h . merge! landscape : thumb [ :landscape ] ,
25
+ image : Base64 . encode64 ( thumb [ :image ] . webpsave_buffer ) . chomp
29
26
end # self.hash_image_buffer
30
27
31
28
# Fingerprint the image stored at image_path and create the temporary YAML
# status file used by the matching job.
#
# The status file is written in Setting['dir.sorting'] and is named after the
# image IDHash ("<idhash>.yml"). It contains everything returned by
# hash_image_buffer plus `status: :comparing` and `started_at`.
#
# Returns the hash produced by hash_image_buffer (unchanged: the status keys
# are only added to the copy written to disk).
def self.hash_image(image_path)
  result = hash_image_buffer File.binread(image_path)

  # create metadata + embedded image
  status_file = File.join(Setting['dir.sorting'], "#{result[:idhash]}.yml")
  File.atomic_write(status_file) do |f|
    f.puts result.merge(status: :comparing, started_at: Time.now).to_yaml
  end

  result
end # self.hash_image
44
41
45
42
# return final results and delete temp file after matching completed
@@ -58,14 +55,15 @@ def self.rm_results_file(image_hash)
58
55
FileUtils . rm_f File . join ( Setting [ 'dir.sorting' ] , "#{ image_hash } .yml" ) . to_s
59
56
end # self.rm_results_file
60
57
61
- def self . find ( model , phash , max_distance : MAX_HAMMING_DISTANCE , from_id : 0 )
58
+ def self . find ( model , phash , idhash , from_id : 0 )
62
59
# https://stackoverflow.com/questions/2281580/is-there-any-way-to-convert-an-integer-3-in-decimal-form-to-its-binary-equival/2310694#2310694
63
60
# https://stackoverflow.com/questions/49601249/string-to-binary-and-back-using-pure-sqlite
64
61
# GENERATE TERMS: puts (0..63).map{|i| "(x>>#{i.to_s.rjust 2}&1)" }.each_slice(5).map{|s| s.join(' + ') }.join(" +\n")
65
62
# NOTE: a XOR b = (a|b)-(a&b) = (~(a&b))&(a|b)
66
63
query = <<~SQL
67
64
SELECT
68
65
id
66
+ , cover_idhash
69
67
, (x &1) + (x>> 1&1) + (x>> 2&1) + (x>> 3&1) + (x>> 4&1) +
70
68
(x>> 5&1) + (x>> 6&1) + (x>> 7&1) + (x>> 8&1) + (x>> 9&1) +
71
69
(x>>10&1) + (x>>11&1) + (x>>12&1) + (x>>13&1) + (x>>14&1) +
@@ -81,38 +79,61 @@ def self.find(model, phash, max_distance: MAX_HAMMING_DISTANCE, from_id: 0)
81
79
(x>>60&1) + (x>>61&1) + (x>>62&1) + (x>>63&1) AS hamming_distance
82
80
FROM (
83
81
SELECT id
84
- , (~(a&b))&(a|b) AS x -- x = a XOR b
82
+ , cover_idhash
83
+ , (~(a&b))&(a|b) AS x -- x = a XOR b
85
84
FROM (
86
85
SELECT id
87
- , 0x#{ phash } AS a
88
- , cover_phash AS b
86
+ , #{ phash . to_i } AS a
87
+ , cover_phash AS b
88
+ , cover_idhash
89
89
FROM #{ model . table_name }
90
- WHERE cover_phash IS NOT NULL
90
+ WHERE cover_phash IS NOT NULL
91
+ AND cover_idhash IS NOT NULL
91
92
AND id > #{ from_id . to_i }
92
93
)
93
94
)
94
- WHERE hamming_distance < #{ max_distance . to_i }
95
+ WHERE hamming_distance < #{ MAX_HAMMING_DISTANCE_PHASH }
95
96
ORDER BY hamming_distance
96
97
LIMIT 10
97
98
SQL
98
99
99
- model .
100
- find_by_sql ( query ) .
101
- inject ( { } ) { |h , d | h . merge d . id => ( ( 1 - d . hamming_distance . to_f / 64 ) * 100 ) . round }
100
+ idhash = idhash . to_i ( 16 ) if idhash . is_a? ( String )
101
+
102
+ model . find_by_sql ( query ) . inject ( { } ) do |h , d |
103
+ # compute the more accurate IDHash distance
104
+ DHashVips ::IDHash . distance3 ( d . cover_idhash . to_i ( 16 ) , idhash ) < MAX_HAMMING_DISTANCE_IDHASH \
105
+ ? h . merge! ( d . id => ( ( 1 - d . hamming_distance . to_f / 64 ) * 100 ) . round )
106
+ : h
107
+ end
102
108
end # self.find
103
109
110
# Compare two covers using both of their fingerprints.
#
# phash1/phash2 are signed 64-bit integers, idhash1/idhash2 are hexadecimal
# strings.
#
# Returns the similarity percentage (Integer, computed from the pHash hamming
# distance over 64 bits) when both the pHash and the IDHash distances are
# below their thresholds; returns nil otherwise, including when any of the
# four hashes is nil.
def self.similarity(phash1, phash2, idhash1, idhash2)
  # a cover without fingerprints cannot be matched: report "no match"
  # instead of raising from pack/to_i
  return nil if [phash1, phash2, idhash1, idhash2].any?(&:nil?)

  # reinterpret the signed 64-bit values as unsigned before comparing bits
  d1 = Phashion.hamming_distance([phash1].pack('q').unpack1('Q'),
                                 [phash2].pack('q').unpack1('Q'))
  return nil unless d1 < MAX_HAMMING_DISTANCE_PHASH

  # compute the more accurate IDHash distance
  d2 = DHashVips::IDHash.distance3(idhash1.to_i(16), idhash2.to_i(16))
  return nil unless d2 < MAX_HAMMING_DISTANCE_IDHASH

  ((1 - d1.to_f / 64) * 100).round
end # self.similarity
124
+
104
125
# read image data from temp file and do a matching against all saved doujinshi
# by computing hamming distance between pHashes
#
# phash and idhash are the two fingerprints of the image to match; idhash also
# names the status file ("<idhash>.yml" in Setting['dir.sorting']).
#
# The status file is loaded, updated with `status: :completed`, `finished_at`
# and the matches found in Doujin (`results`) and DeletedDoujin
# (`results_deleted`), then rewritten in place.
def perform(phash, idhash)
  fname = File.join(Setting['dir.sorting'], "#{idhash}.yml")

  # NOTE(review): unsafe_load_file can instantiate arbitrary objects; this
  # presumably only ever reads the file written by hash_image -- confirm that
  # nothing untrusted can write into dir.sorting
  info = YAML.unsafe_load_file fname

  # write results
  info.merge! \
    status: :completed,
    finished_at: Time.now,
    results: CoverMatchingJob.find(Doujin, phash, idhash),
    results_deleted: CoverMatchingJob.find(DeletedDoujin, phash, idhash)

  File.atomic_write(fname) { |f| f.puts info.to_yaml }
end # perform
0 commit comments