-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews-aggregator.cc~
221 lines (207 loc) · 6.58 KB
/
news-aggregator.cc~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/**
* File: news-aggregator.cc
* ------------------------
* When fully implemented, pulls and parses every single
* news article reachable from some RSS feed in the user-supplied
* RSS News Feed XML file, and then allows the user to query the
* index.
*/
#include <algorithm>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <libxml/parser.h>
#include <libxml/catalog.h>
#include "ostreamlock.h"
#include "semaphore.h"
#include "article.h"
#include "rss-feed-list.h"
#include "rss-feed.h"
#include "rss-index.h"
#include "html-document.h"
#include "html-document-exception.h"
#include "rss-feed-exception.h"
#include "rss-feed-list-exception.h"
#include "news-aggregator-utils.h"
#include "string-utils.h"
using namespace std;
// Initial count of feedsAllowed: processAllFeeds waits on that semaphore
// before starting each feed thread, and feedtoTokens signals it when done.
static const unsigned int kMaxfeed = 8;
// Initial count given to every per-server semaphore created in feedtoTokens.
static const unsigned int kMaxTperS= 12;
// Initial count of threadsAllowed, which articletoTokens waits on before it
// starts working and signals when it finishes.
static const unsigned int kMaxthread=64;
// The index that articletoTokens adds to and queryIndex searches.
static RSSIndex index;
// One semaphore per server string (the result of getURLServer on an article
// URL); entries are created on first use in feedtoTokens.
static map<string, unique_ptr<semaphore>> serverlocks;
static semaphore feedsAllowed(kMaxfeed);
static semaphore threadsAllowed(kMaxthread);
// Held around index.add in articletoTokens.
static mutex rssindexlock;
// Held around lookups/insertions in serverlocks in feedtoTokens.
static mutex servermaplock;
/**
 * Function: printUsage
 * --------------------
 * Writes a one-line usage summary to standard error, naming the
 * executable and the single <feed-url> argument it expects.
 */
static void printUsage(const string& executableName) {
  const string usageLine = "Usage: " + executableName + " <feed-url>";
  cerr << usageLine << endl;
}
/*
*Helper Function:printFeeds
*Print out all the feeds in the feedList.
*/
/*static void printFeeds(const map<string, string>& feeds){
int count=0;
for(auto t: feeds){
cout<<"["<<count++<<"]"<<"title: "<<t.second<<endl<<"url: "<<t.first<<endl;
}
}
*/
/*
*Method: articletoTokens
*Pull the html file of the article and get the tokens in it.
*/
static void articletoTokens(const Article& article,unique_ptr<semaphore>& up){
threadsAllowed.wait();
string title = article.title;
if (shouldTruncate(title)) title = truncate(title);
string url = article.url;
HTMLDocument htmlDoc(url);
if (shouldTruncate(url)) url = truncate(url);
cout<<oslock<<" " << setw(2) << setfill(' ')<<"Parsing \""<<title<<"\""<<endl<<osunlock;
cout<<oslock<<" " << setw(6) << setfill(' ')<<"[at \""<<url<<"\"]"<<endl<<osunlock;
try {
htmlDoc.parse();
} catch (const HTMLDocumentException& htmle) {
cerr << "Ran into trouble while pulling full html document from \""
<< htmlDoc.getURL() << "\"." << endl;
cerr << "Aborting...." << endl;
exit(0);
}
const vector<string>& tokens = htmlDoc.getTokens();
rssindexlock.lock();
index.add(article,tokens);
rssindexlock.unlock();
threadsAllowed.signal();
up->signal();
}
/*
*Method: feedtoTokens
*Pull the articles from the feed.and call articletoTokens
*/
static void feedtoTokens(const pair<string,string>& feed){
cout<<"Begin full download of feed URI: "<<feed.first<<endl;
RSSFeed rssfeed(feed.first);
try {
rssfeed.parse();
} catch (const RSSFeedException& rfe) {
cerr << "Ran into trouble while pulling full RSS feed from \""
<< feed.first << "\"." << endl;
cerr << "Aborting...." << endl;
servermaplock.unlock();
rssindexlock.unlock();
exit(0);
}
vector<thread> articlethreads;
const vector<Article>& articles = rssfeed.getArticles();
for(const Article& article: articles){
string serverurl=getURLServer(article.url);
servermaplock.lock();
unique_ptr<semaphore>& up=serverlocks[serverurl];
if(up==nullptr){
up.reset(new semaphore(kMaxTperS));
}
servermaplock.unlock();
up->wait();
articlethreads.push_back(thread(articletoTokens,article,std::move(up)));
}
for (thread& t: articlethreads) t.join();
feedsAllowed.signal();
cout<<oslock<<"End full download of feed URI: "<<feed.first<<endl<<osunlock;
}
static void processAllFeeds(const string& feedListURI) {
vector<thread> feedthreads;
RSSFeedList feedList(feedListURI);
try {
feedList.parse();//Pulls the content from the encapsulated URL
} catch (const RSSFeedListException& rfle) {
cerr << "Ran into trouble while pulling full RSS feed list from \""
<< feedListURI << "\"." << endl;
cerr << "Aborting...." << endl;
exit(0);
}
auto allfeeds=feedList.getFeeds();
//printFeeds(allfeeds);
for(auto feed: allfeeds){
feedsAllowed.wait();
feedthreads.push_back(thread(feedtoTokens,feed));
}
for (thread& t: feedthreads) t.join();
// add well-decomposed code to read all of the RSS news feeds from feedList
// for their news articles, and for each news article URL, process it
// as an HTMLDocument and add all of the tokens to the master RSSIndex.
}
/**
* Function: queryIndex
* --------------------
* queryIndex repeatedly prompts the user for search terms, and
* for each nonempty search term returns the list of matching documents,
* ranked by frequency.
*/
static const size_t kMaxMatchesToShow = 15;
static void queryIndex() {
while (true) {
cout << "Enter a search term [or just hit <enter> to quit]: ";
string response;
getline(cin, response);
response = trim(response);
if (response.empty()) break;
const vector<pair<Article, int> >& matches = index.getMatchingArticles(response);
if (matches.empty()) {
cout << "Ah, we didn't find the term \"" << response << "\". Try again." << endl;
} else {
cout << "That term appears in " << matches.size() << " article"
<< (matches.size() == 1 ? "" : "s") << ". ";
if (matches.size() > kMaxMatchesToShow)
cout << "Here are the top " << kMaxMatchesToShow << " of them:" << endl;
else
cout << "Here they are:" << endl;
size_t count = 0;
for (const pair<Article, int>& match: matches) {
if (count == kMaxMatchesToShow) break;
count++;
string title = match.first.title;
if (shouldTruncate(title)) title = truncate(title);
string url = match.first.url;
if (shouldTruncate(url)) url = truncate(url);
string times = match.second == 1 ? "time" : "times";
cout << " " << setw(2) << setfill(' ') << count << ".) "
<< "\"" << title << "\" [appears " << match.second << " " << times << "]." << endl;
cout << " \"" << url << "\"" << endl;
}
}
}
}
/**
 * Function: main
 * --------------
 * Program entry point.  Expects exactly one command-line argument, the
 * RSS feed list URI.  Builds the index from that list with libxml2
 * initialized around the download phase, then runs the interactive query
 * loop until the user quits.
 */
int main(int argc, const char *argv[]) {
  const bool haveFeedList = (argc == 2);
  if (!haveFeedList) {
    cerr << "Error: wrong number of arguments." << endl;
    printUsage(argv[0]);
    exit(0);
  }

  const string rssFeedListURI(argv[1]);

  // libxml2 is set up before the feeds are processed and torn down after.
  xmlInitParser();
  xmlInitializeCatalog();
  processAllFeeds(rssFeedListURI);
  xmlCatalogCleanup();
  xmlCleanupParser();

  cout << endl;
  queryIndex();
  cout << "Exiting...." << endl;
  return 0;
}