Skip to content

Commit d105adc

Browse files
author
abtv
committed
stopwords in hype-meter
1 parent 45833b4 commit d105adc

File tree

4 files changed

+192
-6
lines changed

4 files changed

+192
-6
lines changed

project.clj

+2
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@
8282
:max-tweet-count "1000"
8383
:hype-tweet-count "10000"
8484
:similarity-threshold "0.7"
85+
:stop-words "stopwords.txt"
8586
:metrics-timeout-s "100"
8687
:log-path "./logs/tech-radar.log"
8788
:max-log-size-mb "1"
@@ -102,6 +103,7 @@
102103
:max-tweet-count "500000"
103104
:hype-tweet-count "10000"
104105
:similarity-threshold "0.7"
106+
:stop-words "stopwords.txt"
105107
:metrics-timeout-s "300"
106108
:log-path "./logs/tech-radar.log"
107109
:max-log-size-mb "1"

src/clj/tech_radar/components/analysis.clj

+11-2
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,8 @@
1717
get-cached-trends]]
1818
[tech-radar.analytics.protocols :as protocols]
1919
[immutant.scheduling :refer [schedule every in stop id]]
20-
[tech-radar.services.hype-meter :as hype-meter]))
20+
[tech-radar.services.hype-meter :as hype-meter]
21+
[clojure.string :as s]))
2122

2223
(defn- get-settings []
2324
{:max-hashtags-per-trend (-> (env :max-hashtags-per-trend)
@@ -29,6 +30,12 @@
2930
:cache-update-timeout-s (-> (env :cache-update-timeout-s)
3031
(parse-int))})
3132

33+
(defn- load-stop-words [file-name]
34+
(->> (slurp file-name)
35+
(s/split-lines)
36+
(filter (comp not s/blank?))
37+
(set)))
38+
3239
(defrecord Analysis [database metrics preprocessor
3340
stop-hashtags-update-fn stop-cache-update-fn
3441
hype-meter-job
@@ -74,7 +81,9 @@
7481
:hype-tweet-count (-> (env :hype-tweet-count)
7582
(parse-int))
7683
:similarity-threshold (-> (env :similarity-threshold)
77-
(parse-double))})]
84+
(parse-double))
85+
:stop-words (-> (env :stop-words)
86+
(load-stop-words))})]
7887
(assoc component :stop-hashtags-update-fn stop-hashtags-update-fn
7988
:stop-cache-update-fn stop-cache-update-fn
8089
:hype-meter-job (schedule hype-meter-fn

src/clj/tech_radar/services/hype_meter.clj

+5-4
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,16 @@
33
[tech-radar.database.tweets :as tweets]
44
[taoensso.timbre :as timbre]))
55

6-
(defn run-hype-meter [{:keys [database topics hype-tweet-count similarity-threshold]}]
6+
(defn run-hype-meter [{:keys [database topics hype-tweet-count similarity-threshold stop-words]}]
77
(reduce (fn [acc topic]
88
(let [tweets (tweets/load-daily-tweets-per-topic database {:topic topic
99
:max-record-count hype-tweet-count})
10-
popular-tweets (hype-meter/popular-tweets tweets {:stop-words #{}
10+
popular-tweets (hype-meter/popular-tweets tweets {:stop-words stop-words
1111
:hype-count 10
1212
:similarity-threshold similarity-threshold})]
1313
(assoc acc topic popular-tweets))) {} topics))
1414

15-
(defn new-hype-meter-fn [{:keys [cache database topics hype-tweet-count similarity-threshold]}]
15+
(defn new-hype-meter-fn [{:keys [cache database topics hype-tweet-count similarity-threshold stop-words]}]
1616
(let [busy (atom false)]
1717
(fn []
1818
(when-not @busy
@@ -23,7 +23,8 @@
2323
popular-tweets (run-hype-meter {:database database
2424
:topics topics
2525
:hype-tweet-count hype-tweet-count
26-
:similarity-threshold similarity-threshold})]
26+
:similarity-threshold similarity-threshold
27+
:stop-words stop-words})]
2728
(swap! cache (fn [cache]
2829
(reduce (fn [cache [topic tweets]]
2930
(assoc-in cache [topic :popular-tweets] tweets))

stopwords.txt

+174
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,174 @@
1+
a
2+
about
3+
above
4+
after
5+
again
6+
against
7+
all
8+
am
9+
an
10+
and
11+
any
12+
are
13+
aren't
14+
as
15+
at
16+
be
17+
because
18+
been
19+
before
20+
being
21+
below
22+
between
23+
both
24+
but
25+
by
26+
can't
27+
cannot
28+
could
29+
couldn't
30+
did
31+
didn't
32+
do
33+
does
34+
doesn't
35+
doing
36+
don't
37+
down
38+
during
39+
each
40+
few
41+
for
42+
from
43+
further
44+
had
45+
hadn't
46+
has
47+
hasn't
48+
have
49+
haven't
50+
having
51+
he
52+
he'd
53+
he'll
54+
he's
55+
her
56+
here
57+
here's
58+
hers
59+
herself
60+
him
61+
himself
62+
his
63+
how
64+
how's
65+
i
66+
i'd
67+
i'll
68+
i'm
69+
i've
70+
if
71+
in
72+
into
73+
is
74+
isn't
75+
it
76+
it's
77+
its
78+
itself
79+
let's
80+
me
81+
more
82+
most
83+
mustn't
84+
my
85+
myself
86+
no
87+
nor
88+
not
89+
of
90+
off
91+
on
92+
once
93+
only
94+
or
95+
other
96+
ought
97+
our
98+
ours
99+
ourselves
100+
out
101+
over
102+
own
103+
same
104+
shan't
105+
she
106+
she'd
107+
she'll
108+
she's
109+
should
110+
shouldn't
111+
so
112+
some
113+
such
114+
than
115+
that
116+
that's
117+
the
118+
their
119+
theirs
120+
them
121+
themselves
122+
then
123+
there
124+
there's
125+
these
126+
they
127+
they'd
128+
they'll
129+
they're
130+
they've
131+
this
132+
those
133+
through
134+
to
135+
too
136+
under
137+
until
138+
up
139+
very
140+
was
141+
wasn't
142+
we
143+
we'd
144+
we'll
145+
we're
146+
we've
147+
were
148+
weren't
149+
what
150+
what's
151+
when
152+
when's
153+
where
154+
where's
155+
which
156+
while
157+
who
158+
who's
159+
whom
160+
why
161+
why's
162+
with
163+
won't
164+
would
165+
wouldn't
166+
you
167+
you'd
168+
you'll
169+
you're
170+
you've
171+
your
172+
yours
173+
yourself
174+
yourselves

0 commit comments

Comments
 (0)