|
17 | 17 | {:id id
|
18 | 18 | :words (utils/get-words text stop-words-set)}) tweets))
|
19 | 19 |
|
20 |
| -(defn calc-total [tweets-bags current] |
21 |
| - (let [similarity-threshold 0.5 |
22 |
| - {:keys [words]} (nth tweets-bags current)] |
| 20 | +(defn calc-total [tweets-bags current similarity-threshold] |
| 21 | + (let [{:keys [words]} (nth tweets-bags current)] |
23 | 22 | (loop [acc 0
|
24 | 23 | current (inc current)]
|
25 | 24 | (if (< current (count tweets-bags))
|
|
30 | 29 | acc) (inc current)))
|
31 | 30 | acc))))
|
32 | 31 |
|
33 |
| -(defn reorder-tweets-by-similarity [tweets-bags] |
| 32 | +(defn reorder-tweets-by-similarity [tweets-bags similarity-threshold] |
34 | 33 | (let [weights (loop [weights (transient [])
|
35 | 34 | i 0]
|
36 | 35 | (if (< i (dec (count tweets-bags)))
|
37 |
| - (recur (conj! weights [i (calc-total tweets-bags i)]) (inc i)) |
| 36 | + (recur (conj! weights [i (calc-total tweets-bags i similarity-threshold)]) (inc i)) |
38 | 37 | (persistent! weights)))]
|
39 | 38 | (->> weights
|
40 | 39 | (filter (fn [[id weight]]
|
|
47 | 46 | :else 1)))
|
48 | 47 | (mapv first))))
|
49 | 48 |
|
50 |
| -(defn popular-tweets [tweets {:keys [stop-words hype-count]}] |
51 |
| - (let [tweets-bags (tweets->bags tweets stop-words) |
52 |
| - ordered-indices (reorder-tweets-by-similarity tweets-bags) |
53 |
| - get-tweet (fn [ordered-index] |
54 |
| - (->> (nth ordered-indices ordered-index) |
55 |
| - (nth tweets))) |
56 |
| - get-words (fn [ordered-index] |
57 |
| - (->> (nth ordered-indices ordered-index) |
58 |
| - (nth tweets-bags) |
59 |
| - (:words))) |
60 |
| - similarity-threshold 0.99] |
61 |
| - (prn ordered-indices) |
| 49 | +(defn popular-tweets [tweets {:keys [stop-words hype-count similarity-threshold]}] |
| 50 | + (let [tweets-bags (tweets->bags tweets stop-words) |
| 51 | + ordered-indices (reorder-tweets-by-similarity tweets-bags similarity-threshold) |
| 52 | + get-tweet (fn [tweet-index] |
| 53 | + (nth tweets tweet-index)) |
| 54 | + get-words (fn [ordered-index] |
| 55 | + (->> (nth ordered-indices ordered-index) |
| 56 | + (nth tweets-bags) |
| 57 | + (:words))) |
| 58 | + unique-indices (mapv (fn [_] |
| 59 | + true) ordered-indices)] |
62 | 60 | (if (seq ordered-indices)
|
63 |
| - (loop [popular-tweets (transient [(get-tweet 0)]) |
64 |
| - words (get-words 0) |
65 |
| - index 1] |
66 |
| - (if (< index (count ordered-indices)) |
67 |
| - (let [words2 (get-words index) |
68 |
| - sim (calc-similarity words words2) |
69 |
| - similar (> sim similarity-threshold)] |
70 |
| - (recur (if similar |
71 |
| - popular-tweets |
72 |
| - (conj! popular-tweets (get-tweet index))) |
73 |
| - (if similar |
74 |
| - words |
75 |
| - (get-words index)) |
76 |
| - (inc index))) |
77 |
| - (->> (persistent! popular-tweets) |
| 61 | + (loop [i 0 |
| 62 | + unique-indices unique-indices] |
| 63 | + (if (< i (count ordered-indices)) |
| 64 | + (let [words (get-words i) |
| 65 | + unique-indices (if (nth unique-indices i) |
| 66 | + (loop [j (inc i) |
| 67 | + unique-indices unique-indices] |
| 68 | + (if (< j (count ordered-indices)) |
| 69 | + (let [words2 (get-words j) |
| 70 | + sim (calc-similarity words words2) |
| 71 | + similar (>= sim similarity-threshold)] |
| 72 | + (recur (inc j) (if similar |
| 73 | + (assoc unique-indices j false) |
| 74 | + unique-indices))) |
| 75 | + unique-indices)) |
| 76 | + unique-indices)] |
| 77 | + (recur (inc i) unique-indices)) |
| 78 | + (->> ordered-indices |
| 79 | + (map-indexed (fn [idx tweet-idx] |
| 80 | + [idx tweet-idx])) |
| 81 | + (filter (fn [[idx tweet-idx]] |
| 82 | + (nth unique-indices idx))) |
| 83 | + (map (comp get-tweet second)) |
78 | 84 | (take hype-count))))
|
79 | 85 | [])))
|
0 commit comments