Rakefile

require 'bundler/setup'
require 'awesome_print'
require 'json'
require 'pry'
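
# Run `bundle exec rake` to execute the default task below and (re)generate
# the JSON files under ./data.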
desc 'Generate json files from markdown documents'
task :default do
  require 'stopwords'
  require 'lingua/stemmer'
  require 'babosa'
  # require 'matrix'
  require 'narray'
  require 'tf-idf-similarity'
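  # (narray is required explicitly so the :narray backend requested for
  # the tf-idf models below is available)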

  # load files from documents/*.md
  puts 'Reading markdown documents...'
  documents_paths = Dir.glob('documents/*.md').sort
  documents = {}
  documents_paths.each_with_index do |path, document_index|
    text = File.read(path)
    # split the file at level 1-3 headings into { title => [{ id:, text: }, ...] };
    # the first heading of each file becomes the document title
    paragraphs = text.scan(/\#{1,3}[^#]+/).map(&:strip)
    title = paragraphs.shift.gsub(/#\s*/, '')
    documents[title] = paragraphs.each_with_index.map do |paragraph_text, paragraph_index|
      { id: "d#{document_index}p#{paragraph_index}", text: paragraph_text }
    end
  end
  puts 'done.'

  File.open('./data/documents.json', 'w+') do |f|
    f.write documents.to_json
    puts 'writing ./data/documents.json'
  end
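
  # documents.json ends up shaped roughly like:
  #   { "Some Title": [{ "id": "d0p0", "text": "..." }, ...], ... }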

  # create corpora from preprocessed paragraph texts:
  # German Snowball stopword list, extended with capitalized variants
  capitalized_stopwords = Stopwords::Snowball::Filter.new('de').stopwords.map(&:capitalize)
  stopwords = Stopwords::Snowball::Filter.new('de', capitalized_stopwords)
  stemmer = Lingua::Stemmer.new(language: 'de')
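  # the stemmer maps inflected forms onto a common stem,
  # e.g. "gehen" and "geht" both reduce to roughly "geh"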

  # a stemmed corpus (for similarities) and an unstemmed one
  # (for human-readable important terms)
  corpus = []
  unstemmed_corpus = []
  documents.each_value do |paragraphs|
    paragraphs.each do |paragraph|
      text = paragraph[:text]
      # un-hyphenate across line breaks, keeping suspended hyphens
      # as in "Haus- und Gartenbedarf"
      text = text.gsub(/(\w)-[\s\n]+(?!und)/, '\1')
      # transliterate German characters (ü -> ue, ß -> ss, ...)
      text = text.to_slug.transliterate(:german).to_s
      # lowercase
      text = text.downcase
      # remove numbers
      text = text.gsub(/\d/, '')
      # remove remaining punctuation
      text = text.gsub(/[^a-zA-ZüÜöÖäÄ\s]/, '')
      # tokenize
      tokens = text.strip.split(/[\s\r\n]+/)
      # drop stopwords
      tokens = stopwords.filter tokens
      unstemmed_corpus << TfIdfSimilarity::Document.new(paragraph[:id], tokens: tokens)
      # stem
      tokens = tokens.map { |token| stemmer.stem(token) }
      corpus << TfIdfSimilarity::Document.new(paragraph[:id], tokens: tokens)
    end
  end

  puts 'Computing tf/idf models...'
  # BM25-weighted models; the unstemmed model is only used to pick
  # human-readable important terms below
  model = TfIdfSimilarity::BM25Model.new(corpus, library: :narray)
  unstemmed_model = TfIdfSimilarity::BM25Model.new(unstemmed_corpus, library: :narray)
  puts 'done.'

  # calculate important terms from the unstemmed corpus
  puts 'Calculating important terms...'
  important = {}
  corpus.each_with_index do |doc, i|
    # reach into the gem's internal weight matrix (not public API)
    # to read the per-term weights for document i
    weights = unstemmed_model.instance_variable_get('@matrix').transpose[i, 0..-1]
    terms_with_weights = unstemmed_model.terms.zip(weights)
    # keep the three highest-weighted terms; Document#text holds the
    # paragraph id here, because the id was passed as the document text above
    important[doc.text] = terms_with_weights.sort_by { |_term, weight| -weight }[0..2].to_h
  end
  puts 'done.'

  File.open('./data/important.json', 'w+') do |f|
    f.write important.to_json
    puts 'writing ./data/important.json'
  end
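
  # important.json ends up shaped roughly like:
  #   { "d0p0": { "term": 4.2, "another": 3.1, "third": 2.9 }, ... }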

  # write the pairwise similarity matrix from the stemmed corpus;
  # the values are BM25 cosine similarities, stored under the name
  # "distances", plus an :avg entry per paragraph
  puts 'Calculating paragraph distances...'
  similarity_matrix = model.similarity_matrix
  distances = {}
  corpus.each_with_index do |lhs, i|
    distances[lhs.text] = {}
    sum = 0.0
    corpus.each_with_index do |rhs, j|
      sum += distances[lhs.text][rhs.text] = similarity_matrix[i, j]
    end
    distances[lhs.text][:avg] = sum / corpus.length
  end
  puts 'done.'

  File.open('./data/distances.json', 'w+') do |f|
    f.write distances.to_json
    puts 'writing ./data/distances.json'
  end
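
  # distances.json ends up shaped roughly like:
  #   { "d0p0": { "d0p0": 1.0, "d0p1": 0.42, ..., "avg": 0.61 }, ... }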

  # dump every similarity value into a one-column CSV for debugging
  File.open('./debug.csv', 'w+') do |f|
    f.puts 'val'
    distances.each_value { |row| row.each_value { |val| f.puts val } }
  end
end