Skip to content

Commit d169401

Browse files
author
abtv
committed
filtering step in hype-meter
1 parent b541eb4 commit d169401

File tree

6 files changed

+137
-106
lines changed

6 files changed

+137
-106
lines changed

project.clj

+4
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@
8080
:max-hashtags-per-trend "25"
8181
:max-texts-per-request "200"
8282
:max-tweet-count "1000"
83+
:hype-tweet-count "10000"
84+
:similarity-threshold "0.5"
8385
:metrics-timeout-s "100"
8486
:log-path "./logs/tech-radar.log"
8587
:max-log-size-mb "1"
@@ -98,6 +100,8 @@
98100
:max-hashtags-per-trend "25"
99101
:max-texts-per-request "200"
100102
:max-tweet-count "500000"
103+
:hype-tweet-count "10000"
104+
:similarity-threshold "0.5"
101105
:metrics-timeout-s "300"
102106
:log-path "./logs/tech-radar.log"
103107
:max-log-size-mb "1"

src/clj/tech_radar/analytics/hype_meter.clj

+38-32
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@
1717
{:id id
1818
:words (utils/get-words text stop-words-set)}) tweets))
1919

20-
(defn calc-total [tweets-bags current]
21-
(let [similarity-threshold 0.5
22-
{:keys [words]} (nth tweets-bags current)]
20+
(defn calc-total [tweets-bags current similarity-threshold]
21+
(let [{:keys [words]} (nth tweets-bags current)]
2322
(loop [acc 0
2423
current (inc current)]
2524
(if (< current (count tweets-bags))
@@ -30,11 +29,11 @@
3029
acc) (inc current)))
3130
acc))))
3231

33-
(defn reorder-tweets-by-similarity [tweets-bags]
32+
(defn reorder-tweets-by-similarity [tweets-bags similarity-threshold]
3433
(let [weights (loop [weights (transient [])
3534
i 0]
3635
(if (< i (dec (count tweets-bags)))
37-
(recur (conj! weights [i (calc-total tweets-bags i)]) (inc i))
36+
(recur (conj! weights [i (calc-total tweets-bags i similarity-threshold)]) (inc i))
3837
(persistent! weights)))]
3938
(->> weights
4039
(filter (fn [[id weight]]
@@ -47,33 +46,40 @@
4746
:else 1)))
4847
(mapv first))))
4948

50-
(defn popular-tweets [tweets {:keys [stop-words hype-count]}]
51-
(let [tweets-bags (tweets->bags tweets stop-words)
52-
ordered-indices (reorder-tweets-by-similarity tweets-bags)
53-
get-tweet (fn [ordered-index]
54-
(->> (nth ordered-indices ordered-index)
55-
(nth tweets)))
56-
get-words (fn [ordered-index]
57-
(->> (nth ordered-indices ordered-index)
58-
(nth tweets-bags)
59-
(:words)))
60-
similarity-threshold 0.99]
61-
(prn ordered-indices)
49+
(defn popular-tweets [tweets {:keys [stop-words hype-count similarity-threshold]}]
50+
(let [tweets-bags (tweets->bags tweets stop-words)
51+
ordered-indices (reorder-tweets-by-similarity tweets-bags similarity-threshold)
52+
get-tweet (fn [tweet-index]
53+
(nth tweets tweet-index))
54+
get-words (fn [ordered-index]
55+
(->> (nth ordered-indices ordered-index)
56+
(nth tweets-bags)
57+
(:words)))
58+
unique-indices (mapv (fn [_]
59+
true) ordered-indices)]
6260
(if (seq ordered-indices)
63-
(loop [popular-tweets (transient [(get-tweet 0)])
64-
words (get-words 0)
65-
index 1]
66-
(if (< index (count ordered-indices))
67-
(let [words2 (get-words index)
68-
sim (calc-similarity words words2)
69-
similar (> sim similarity-threshold)]
70-
(recur (if similar
71-
popular-tweets
72-
(conj! popular-tweets (get-tweet index)))
73-
(if similar
74-
words
75-
(get-words index))
76-
(inc index)))
77-
(->> (persistent! popular-tweets)
61+
(loop [i 0
62+
unique-indices unique-indices]
63+
(if (< i (count ordered-indices))
64+
(let [words (get-words i)
65+
unique-indices (if (nth unique-indices i)
66+
(loop [j (inc i)
67+
unique-indices unique-indices]
68+
(if (< j (count ordered-indices))
69+
(let [words2 (get-words j)
70+
sim (calc-similarity words words2)
71+
similar (>= sim similarity-threshold)]
72+
(recur (inc j) (if similar
73+
(assoc unique-indices j false)
74+
unique-indices)))
75+
unique-indices))
76+
unique-indices)]
77+
(recur (inc i) unique-indices))
78+
(->> ordered-indices
79+
(map-indexed (fn [idx tweet-idx]
80+
[idx tweet-idx]))
81+
(filter (fn [[idx tweet-idx]]
82+
(nth unique-indices idx)))
83+
(map (comp get-tweet second))
7884
(take hype-count))))
7985
[])))

src/clj/tech_radar/components/analysis.clj

+16-18
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
[clojure.core.async :refer [chan close!]]
99
[taoensso.timbre :as timbre]
1010
[environ.core :refer [env]]
11-
[tech-radar.utils.parsers :refer [parse-int]]
11+
[tech-radar.utils.parsers :refer [parse-int
12+
parse-double]]
1213
[tech-radar.utils.settings :refer [load-classify-settings
1314
load-hashtag-filter-settings]]
1415
[tech-radar.analytics.model :refer [new-model]]
@@ -19,17 +20,13 @@
1920
[tech-radar.services.hype-meter :as hype-meter]))
2021

2122
(defn- get-settings []
22-
{:max-hashtags-per-trend (-> env
23-
(:max-hashtags-per-trend)
23+
{:max-hashtags-per-trend (-> (env :max-hashtags-per-trend)
2424
(parse-int))
25-
:max-texts-per-request (-> env
26-
(:max-texts-per-request)
25+
:max-texts-per-request (-> (env :max-texts-per-request)
2726
(parse-int))
28-
:max-tweet-count (-> env
29-
(:max-tweet-count)
27+
:max-tweet-count (-> (env :max-tweet-count)
3028
(parse-int))
31-
:cache-update-timeout-s (-> env
32-
(:cache-update-timeout-s)
29+
:cache-update-timeout-s (-> (env :cache-update-timeout-s)
3330
(parse-int))})
3431

3532
(defrecord Analysis [database metrics preprocessor
@@ -70,19 +67,20 @@
7067
:metrics metrics})
7168
stop-cache-update-fn (run-cache-update {:model model
7269
:cache cache
73-
:cache-update-timeout-s cache-update-timeout-s})]
70+
:cache-update-timeout-s cache-update-timeout-s})
71+
hype-meter-fn (hype-meter/new-hype-meter-fn {:cache cache
72+
:database database
73+
:topics topics
74+
:hype-tweet-count (-> (env :hype-tweet-count)
75+
(parse-int))
76+
:similarity-threshold (-> (env :similarity-threshold)
77+
(parse-double))})]
7478
(assoc component :stop-hashtags-update-fn stop-hashtags-update-fn
7579
:stop-cache-update-fn stop-cache-update-fn
76-
:hype-meter-job (schedule (fn []
77-
(let [popular-tweets (hype-meter/run-hype-meter {:database database
78-
:topics topics})]
79-
(swap! cache (fn [cache]
80-
(reduce (fn [cache [topic tweets]]
81-
(assoc-in cache [topic :popular] tweets))
82-
cache popular-tweets)))))
80+
:hype-meter-job (schedule hype-meter-fn
8381
(-> (id :hype-meter)
8482
(in 0 :minute)
85-
(every 30 :minutes)))
83+
(every 1 :hours)))
8684
:statistic-fn (fn []
8785
(protocols/statistic model))
8886
:trends-fn (fn []
src/clj/tech_radar/services/hype_meter.clj

+28-15
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,31 @@
11
(ns tech-radar.services.hype-meter
22
(:require [tech-radar.analytics.hype-meter :as hype-meter]
3-
[tech-radar.database.tweets :as tweets]))
3+
[tech-radar.database.tweets :as tweets]
4+
[taoensso.timbre :as timbre]))
45

5-
(defn run-hype-meter [{:keys [database topics]}]
6-
(prn "hype-meter: begin")
7-
(let [top (reduce (fn [acc topic]
8-
(let [tweets (tweets/load-daily-tweets-per-topic database {:topic topic
9-
:max-record-count 2000})
10-
_ (prn topic)
11-
_ (prn (count tweets))
12-
popular-tweets (hype-meter/popular-tweets tweets {:stop-words #{}
13-
:hype-count 10})]
14-
(prn topic)
15-
(prn (map :text popular-tweets))
16-
(assoc acc topic popular-tweets))) {} topics)]
17-
(prn "hype-meter: end")
18-
top))
6+
(defn run-hype-meter
  "Computes the popular tweets for every topic.

  For each topic in `topics`, loads the topic's daily tweets from `database`
  and passes them to `hype-meter/popular-tweets` with an empty stop-word set.

  Options:
    :database             - handle passed to `tweets/load-daily-tweets-per-topic`
    :topics               - collection of topics to process
    :hype-tweet-count     - passed as :max-record-count when loading tweets
    :similarity-threshold - passed through to `popular-tweets`
    :hype-count           - optional; passed through to `popular-tweets` as
                            :hype-count. Defaults to 10, the value that was
                            previously hard-coded here.

  Returns a map of topic -> popular tweets."
  [{:keys [database topics hype-tweet-count similarity-threshold hype-count]
    ;; :or only applies when the key is absent, so existing callers that
    ;; never pass :hype-count keep the old behaviour.
    :or   {hype-count 10}}]
  (reduce (fn [acc topic]
            (let [tweets         (tweets/load-daily-tweets-per-topic
                                   database
                                   {:topic            topic
                                    :max-record-count hype-tweet-count})
                  popular-tweets (hype-meter/popular-tweets
                                   tweets
                                   {:stop-words           #{}
                                    :hype-count           hype-count
                                    :similarity-threshold similarity-threshold})]
              (assoc acc topic popular-tweets)))
          {}
          topics))
14+
15+
(defn new-hype-meter-fn
  "Builds the zero-argument job function for the hype-meter.

  Each invocation of the returned function:
    1. logs the start of the job,
    2. calls `run-hype-meter` with `database`, `topics`, `hype-tweet-count`
       and `similarity-threshold`,
    3. stores each topic's result in the `cache` atom under [topic :popular],
    4. logs the elapsed wall-clock time in milliseconds.

  Any `Exception` thrown along the way is logged via timbre and not rethrown."
  [{:keys [cache database topics hype-tweet-count similarity-threshold]}]
  (fn []
    (try
      (timbre/info "hype-meter-job: start")
      (let [started-at   (System/nanoTime)
            by-topic     (run-hype-meter {:database             database
                                          :topics               topics
                                          :hype-tweet-count     hype-tweet-count
                                          :similarity-threshold similarity-threshold})
            ;; Writes one topic's popular tweets into the cache value.
            store-topic  (fn [state [topic tweets]]
                           (assoc-in state [topic :popular] tweets))]
        (swap! cache (fn [state]
                       (reduce store-topic state by-topic)))
        ;; nanoTime difference converted from nanoseconds to milliseconds.
        (let [elapsed-time (/ (double (- (System/nanoTime) started-at)) 1000000.0)]
          (timbre/info (str "hype-meter-job: finish (" elapsed-time " msecs)"))))
      (catch Exception ex
        (timbre/error ex "hype-meter-job failed")))))

src/clj/tech_radar/utils/parsers.clj

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
(Integer/parseInt v))
88
v)))
99

10+
;; NOTE(review): Clojure 1.11+ also defines clojure.core/parse-double; this
;; definition would shadow it inside this namespace — confirm the ns form
;; excludes the core var if the project targets 1.11 or newer.
(defn parse-double
  "Coerces `v` to a double when it is a string.

  - nil or false  -> nil
  - a string      -> result of `Double/parseDouble` (throws
                     NumberFormatException on malformed input)
  - anything else -> returned unchanged"
  [v]
  (cond
    (not v)     nil
    (string? v) (Double/parseDouble v)
    :else       v))
15+
1016
(def months {"Jan" 1
1117
"Feb" 2
1218
"Mar" 3

0 commit comments

Comments
 (0)