@@ -133,7 +133,11 @@ function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keyword
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
{
    $stemmer = new $stemmerClassName();
-    return array_map(function($token) use ($stemmer){ return $stemmer->stem($token); }, $tokens);
+    foreach ($tokens as &$token)
+    {
+        $token = $stemmer->stem($token);
+    }
+    return $tokens;
}
}

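For reference, a minimal usage sketch of the reworked stem() helper; the sample sentence is illustrative, and tokenize() is the library helper already used elsewhere in this diff:

    $tokens  = tokenize('the runners were running quickly');
    $stemmed = stem($tokens); // each token run through the default PorterStemmer

Since the function still returns the (now stemmed) token array, existing call sites that capture the return value keep working.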
@@ -224,28 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
}

/**
- * Return an array of filtered tokens
+ * Pass the tokens in by reference and modify them
 * @param array $tokens
 * @param string $filterType
- * @return string[]
 */
-function filter_tokens(array $tokens, string $filterType) : array
+function filter_tokens(array &$tokens, string $filterType)
{
    $className = "\\TextAnalysis\\Filters\\{$filterType}";
    $filter = new $className();
-    return array_values(array_map(function($token) use ($filter){ return $filter->transform($token); }, $tokens));
+    foreach ($tokens as &$token)
+    {
+        $token = $filter->transform($token);
+    }
}

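A brief usage sketch of the now by-reference filter_tokens(); the filter names below are ones this diff itself uses, and the sample text is illustrative:

    $tokens = tokenize('Hello, World!');
    filter_tokens($tokens, 'LowerCaseFilter');
    filter_tokens($tokens, 'PunctuationFilter');
    // $tokens is modified in place; there is no longer a return value to capture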
/**
 * Filter out stop words
 * @param array $tokens
 * @param array $stopwords
- * @return array
 */
-function filter_stopwords(array $tokens, array $stopwords) : array
+function filter_stopwords(array &$tokens, array &$stopwords)
{
-    $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);
-    return array_values(array_map(function($token) use ($filter){ return $filter->transform($token); }, $tokens));
+    $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);
+    foreach ($tokens as &$token)
+    {
+        $token = $filter->transform($token);
+    }
}

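A usage sketch for the in-place filter_stopwords(), under the assumption that StopWordsFilter::transform() blanks out matching tokens rather than removing them (this diff's own summary_simple() relies on a later filter_empty() call to actually drop them):

    $tokens    = tokenize('the cat sat on the mat');
    $stopwords = ['the', 'on'];            // illustrative stop word list
    filter_stopwords($tokens, $stopwords);
    $tokens = filter_empty($tokens);       // removes the blanked-out entries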
/**
@@ -255,9 +263,89 @@ function filter_stopwords(array $tokens, array $stopwords) : array
 */
function get_stop_words(string $filePath) : array
{
-    return array_map('trim', file($filePath));
+    $rows = file($filePath);
+    array_walk($rows, function(&$value){ $value = trim($value); });
+    return $rows;
}

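Usage of get_stop_words() is unchanged by this refactor; the file path below is purely illustrative:

    $stopwords = get_stop_words('/path/to/stopwords.txt'); // one stop word per line, each trimmed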
+/**
+ * Return the polarity scores from the vader algorithm
+ * @param array $tokens
+ * @return array
+ */
+function vader(array $tokens) : array
+{
+    return (new \TextAnalysis\Sentiment\Vader())->getPolarityScores($tokens);
+}
+
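A quick sketch of calling the new vader() helper; the sample sentence is illustrative, and the exact keys in the returned array are an assumption based on the standard VADER output (negative, neutral, positive plus a compound score):

    $scores = vader(tokenize('php-text-analysis makes this really easy and fun'));
    // $scores is an associative array of polarity scores, e.g. a compound value near +1 for positive text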
+/**
+ * Filter out all null and empty strings
+ * @param array $tokens
+ * @return string[]
+ */
+function filter_empty(array $tokens) : array
+{
+    foreach ($tokens as &$token)
+    {
+        if (empty(trim($token))) {
+            $token = NULL;
+        }
+    }
+    return array_filter($tokens);
+}
+
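Two things worth noting when using filter_empty(): array_filter() preserves the original keys, and empty() also treats the string '0' as empty, so a literal '0' token would be dropped too. A small illustrative call:

    $clean = filter_empty(['dog', '', null, '  ', 'cat']);
    // ['dog', 4 => 'cat'] — keys are preserved; wrap in array_values() if a reindexed list is needed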
+function score_keeper_sort($a, $b)
+{
+    if ($a->getScore() == $b->getScore()) {
+        return 0;
+    }
+    return ($a->getScore() < $b->getScore()) ? 1 : -1;
+}
+
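score_keeper_sort() is a plain comparator that orders any objects exposing getScore() from highest to lowest; a sketch of using it with usort(), assuming $scores holds the score objects produced by the summarizer below:

    usort($scores, 'score_keeper_sort');
    // $scores[0] now holds the highest-scoring entry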
+/**
+ * Apply common filters and return the sentences ordered by score
+ * @param string $text
+ * @param array $stopwords
+ * @return array
+ */
+function summary_simple(string $text, array $stopwords = []) : array
+{
+    $sentenceTokensOriginal = (new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text));
+
+    // create a working copy so the original sentences can be returned untouched
+    $sentenceTokens = $sentenceTokensOriginal;
+    if (!empty($stopwords)) {
+        foreach ($sentenceTokens as &$sentence)
+        {
+            $sentence = str_replace($stopwords, " ", $sentence);
+        }
+    }
+
+    filter_tokens($sentenceTokens, 'TrimFilter');
+    filter_tokens($sentenceTokens, 'QuotesFilter');
+    filter_tokens($sentenceTokens, 'CharFilter');
+
+    $wordTokens = tokenize($text);
+    foreach (['LowerCaseFilter', 'PunctuationFilter', 'QuotesFilter', 'PossessiveNounFilter', 'CharFilter'] as $filterType)
+    {
+        filter_tokens($wordTokens, $filterType);
+    }
+
+    if (!empty($stopwords)) {
+        filter_stopwords($wordTokens, $stopwords);
+    }
+
+    $summarizer = new \TextAnalysis\Analysis\Summarize\Simple();
+    $scores = $summarizer->summarize(filter_empty($wordTokens), $sentenceTokens);
+
+    // reorder sentences in the best order
+    $bestSentences = [];
+    foreach ($scores as $score)
+    {
+        $bestSentences[] = $sentenceTokensOriginal[$score->getIndex()];
+    }
+    return $bestSentences;
+}
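Finally, a sketch of pulling a short summary out of a longer document with the new helper; the input variable, stop word path, and choice of three sentences are illustrative:

    $stopwords = get_stop_words('/path/to/stopwords.txt');  // optional
    $ranked    = summary_simple($articleText, $stopwords);
    $summary   = implode(' ', array_slice($ranked, 0, 3));  // keep the three best-scoring sentences

Note that the returned sentences are lowercased, since summary_simple() lowercases the text before sentence tokenization.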