-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbundle.js
5645 lines (5119 loc) · 230 KB
/
bundle.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i<t.length;i++)o(t[i]);return o}return r})()({1:[function(require,module,exports){
// ### getSpottedTerms
/**
 *
 * Collects the terms of the resultant documents that match the search query.
 *
 * The text of every requested field of every resultant document is joined and
 * sent through the `pipe` in two stages. The tokens available after the first
 * stage (functions placed before `rwIndex`) are the ones reported back; the
 * tokens obtained after the second stage (functions from `rwIndex` onwards)
 * are the ones compared with the fully transformed `query`.
 *
 * @param {array[]} results contains the search results; element `0` of every
 * result is used as the key into `docs`.
 * @param {string} query being searched.
 * @param {Object[]} docs being searched.
 * @param {string[]} fields of the `docs`.
 * @param {function[]} pipe in use for prep task.
 * @param {number} rwIndex index of `removeWords()` function; the `pipe` is
 * split in two stages at this position.
 * @return {string[]} of search terms found in the `results` `docs`.
 */
var getSpottedTerms = function ( results, query, docs, fields, pipe, rwIndex ) {
  // First stage: every function placed before `rwIndex`.
  const leadingStage = pipe.slice( 0, rwIndex );
  // Second stage: the function at `rwIndex` and everything after it.
  const trailingStage = pipe.slice( rwIndex );
  // Work on a copy of the query.
  let preparedQuery = query.slice( 0 );
  // Spotted terms, kept as keys to get rid of duplicates.
  const spotted = Object.create( null );

  // Nothing was found, therefore nothing can be spotted.
  if ( results.length === 0 ) return [];

  // The query goes through the complete pipe.
  let step = 0;
  while ( step < pipe.length ) {
    preparedQuery = pipe[ step ]( preparedQuery );
    step += 1;
  }

  // Gather the text of every field of every resultant document.
  const pieces = [];
  for ( const result of results ) {
    for ( const field of fields ) {
      pieces.push( docs[ result[ 0 ] ][ field ] );
    }
  }

  // Output of the first stage; a copy of it supplies the reported terms.
  const readable = leadingStage.reduce( ( text, fn ) => fn( text ), pieces.join( ' ' ) );
  const reference = readable.slice( 0 );
  // Output of the second stage is what gets matched against the query.
  const comparable = trailingStage.reduce( ( text, fn ) => fn( text ), readable );

  // NOTE(review): pairing `comparable[ k ]` with `reference[ k ]` assumes that the
  // second stage neither adds, drops nor reorders tokens; confirm for the pipe in use.
  for ( let k = 0; k < comparable.length; k += 1 ) {
    if ( preparedQuery.indexOf( comparable[ k ] ) === -1 ) continue;
    spotted[ reference[ k ] ] = true;
  }

  // Convert to array & return!
  return Object.keys( spotted );
};
module.exports = getSpottedTerms;
},{}],2:[function(require,module,exports){
module.exports=[
{
"title": "Barack Obama",
"body": "Barack Hussein Obama II born August 4, 1961 is an American politician who served as the 44th President of the United States from 2009 to 2017. He is the first African American to have served as president. He previously served in the U.S. Senate representing Illinois from 2005 to 2008, and in the Illinois State Senate from 1997 to 2004."
},
{
"title": "Michelle Obama",
"body": "Michelle LaVaughn Robinson Obama (born January 17, 1964) is an American lawyer and writer who was First Lady of the United States from 2009 to 2017. She is married to the 44th President of the United States, Barack Obama, and was the first African-American First Lady. Raised on the South Side of Chicago, Illinois, Obama is a graduate of Princeton University and Harvard Law School, and spent her early legal career working at the law firm Sidley Austin, where she met her husband. She subsequently worked as the Associate Dean of Student Services at the University of Chicago and the Vice President for Community and External Affairs of the University of Chicago Medical Center. Barack and Michelle married in 1992 and have two daughters."
},
{
"title": "William meaning bill clinton",
"body": "William Jefferson Clinton (born William Jefferson Blythe III; August 19, 1946), commonly known as Bill Clinton, is an American politician who served as the 42nd President of the United States from 1993 to 2001. Prior to the Presidency he was the 40th Governor of Arkansas from 1979 to 1981 and the state's 42nd Governor from 1983 to 1992. Before that, he served as Arkansas Attorney General from 1977 to 1979. A member of the Democratic Party, Clinton was ideologically a New Democrat, and many of his policies reflected a centrist political philosophy. Clinton was born and raised in Arkansas and is an alumnus of Georgetown University, where he was a member of Kappa Kappa Psi and the Phi Beta Kappa Society; he earned a Rhodes Scholarship to attend the University of Oxford. Clinton is married to Hillary Rodham Clinton, who served as United States Secretary of State from 2009 to 2013 and U.S. Senator from New York from 2001 to 2009, and was the Democratic nominee for President in 2016. Bill Clinton and Hillary Rodham both earned degrees from Yale Law School, where they met and began dating. As Governor of Arkansas, Clinton overhauled the state's education system and served as chairman of the National Governors Association."
},
{
"title": "Hillary Rodham Clinton",
"body": "Hillary Diane Rodham Clinton (/ˈhɪləri daɪˈæn ˈrɒdəm ˈklɪntən/; born October 26, 1947) is an American politician who was the 67th United States Secretary of State from 2009 to 2013, U.S. Senator from New York from 2001 to 2009, First Lady of the United States from 1993 to 2001, and the Democratic Party's nominee for President of the United States in the 2016 election. Born in Chicago, Illinois, and raised in the Chicago suburb of Park Ridge, Clinton graduated from Wellesley College in 1969, and earned a J.D. from Yale Law School in 1973. After serving as a congressional legal counsel, she moved to Arkansas and married Bill Clinton in 1975. In 1977, she co-founded Arkansas Advocates for Children and Families. She was appointed the first female chair of the Legal Services Corporation in 1978 and became the first female partner at Rose Law Firm the following year. As First Lady of Arkansas, she led a task force whose recommendations helped reform Arkansas's public schools."
},
{
"title": "George W Bush",
"body": "George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President of the United States from 2001 to 2009. He was also the 46th Governor of Texas from 1995 to 2000. After graduating from Yale University in 1968 and Harvard Business School in 1975, he worked in the oil industry. He never studied Law. Bush married Laura Welch in 1977 and ran unsuccessfully for the House of Representatives shortly thereafter. He later co-owned the Texas Rangers baseball team before defeating Ann Richards in the 1994 Texas gubernatorial election. Bush was elected president in 2000 after a close and controversial win over Democratic rival Al Gore, becoming the fourth president to be elected while receiving fewer popular votes than his opponent.[3]"
},
{
"title": "laura W Bush",
"body": "Laura Lane Welch Bush (born November 4, 1946) is the wife of the 43rd President of the United States, George W. Bush, and was the First Lady from 2001 to 2009.[1][2] Bush graduated from Southern Methodist University in 1968 with a bachelor's degree in education, and took a job as a second grade teacher. After attaining her master's degree in library science at the University of Texas at Austin, she was employed as a librarian. Bush met her future husband, George W. Bush, in 1977, and they were married later that year. The couple had twin daughters in 1981. Bush's political involvement began during her marriage. She campaigned with her husband during his unsuccessful 1978 run for the United States Congress, and later for his successful Texas gubernatorial campaign."
},
{
"title": "George H W Bush",
"body": "George Herbert Walker Bush (born June 12, 1924) is an American politician who was the 41st President of the United States from 1989 to 1993 and the 43rd Vice President of the United States from 1981 to 1989. A member of the Republican Party, he was previously a congressman, ambassador, and Director of Central Intelligence. He is the oldest living former President and Vice President. Since 2000, Bush has often been referred to as George H. W. Bush, Bush 41, Bush the Elder, or George Bush Senior to distinguish him from his eldest son, George W. Bush, who became the 43rd President of the United States after the 2000 election."
},
{
"title": "Barbara Bush",
"body": "Barbara Bush (née Pierce; born June 8, 1925) is the wife of George H. W. Bush, the 41st President of the United States, and served as First Lady of the United States from 1989 to 1993. She is the mother of George W. Bush, the 43rd President, and Jeb Bush, the 43rd Governor of Florida. She served as the Second Lady of the United States from 1981 to 1989. Barbara Pierce was born in Flushing, New York. She attended Milton Public School from 1931 to 1937, and Rye Country Day School from 1937-1940. She graduated from Ashley Hall School in Charleston, South Carolina. She met George Herbert Walker Bush at age 16, and the two married in Rye, New York in 1945, while he was on leave during his deployment as a Naval officer in World War II. While George was attending Yale University at age 22, Barbara and George were living in New Haven, Connecticut and had their first son, George Walker Bush, on July 6, 1946. (Thus, her first son, the eventual 43rd President of the United States, was the first Connecticut native to assume that office. George W. would eventually return to his hometown of New Haven in 1964 to attend Yale like his father did.) They had six children together. The Bush family soon moved to Midland, Texas, where their second son, Jeb was born in, on February 11, 1953; as George Bush entered political life, she raised their children."
},
{
"title": "Ronald Reagan",
"body": "Ronald Wilson Reagan (/ˈrɒnəld ˈwɪlsən ˈreɪɡən/) (February 6, 1911 – June 5, 2004) was an American politician and actor who served as the 40th President of the United States from 1981 to 1989. Before his presidency, he was the 33rd Governor of California, from 1967 to 1975, after a career as a Hollywood actor and union leader. Raised in a poor family in small towns of northern Illinois, Reagan graduated from Eureka College in 1932 and worked as a sports announcer on several regional radio stations. After moving to Hollywood in 1937, he became an actor and starred in a few major productions. Reagan was twice elected President of the Screen Actors Guild, the labor union for actors, where he worked to root out Communist influence."
},
{
"title": "Nancy Reagan",
"body": "Nancy Davis Reagan (born Anne Frances Robbins; July 6, 1921 – March 6, 2016) was an American film actress, and the wife of the 40th President of the United States, Ronald Reagan. She served as the First Lady of the United States from 1981 to 1989. She was born in New York City. After her parents separated, she lived in Maryland with an aunt and uncle for some years. She moved to Chicago when her mother remarried in 1929, and later took the name Davis from her stepfather. As Nancy Davis, she was a Hollywood actress in the 1940s and 1950s, starring in films such as The Next Voice You Hear..., Night into Morning, and Donovan's Brain. In 1952, she married Ronald Reagan, who was then president of the Screen Actors Guild. They had two children together. Reagan was the First Lady of California when her husband was Governor from 1967 to 1975, and she began to work with the Foster Grandparents Program."
}
]
},{}],3:[function(require,module,exports){
// wink-bm25-text-search
// Fast Full Text Search based on BM25F
//
// Copyright (C) 2017-19 GRAYPE Systems Private Limited
//
// This file is part of “wink-bm25-text-search”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var helpers = require( 'wink-helpers' );
/* eslint guard-for-in: 0 */
/* eslint complexity: [ "error", 25 ] */
// It is a BM25F In-memory Search engine for text and exposes following
// methods:
// 1. `definePrepTasks` allows to define field-wise (optional) pipeline of
// functions that will be used to prepare each input prior to *search/predict*
// and *addDoc/learn*.
// 2. `defineConfig` sets up the configuration for *field-wise weights*,
// *BM25F parameters*, and **field names whose original value** needs to be retained.
// 3. `addDoc/learn` adds a document using its unique id. The document is supplied
// as a JavaScript object, where each property is the field of the document
// and its value is the text.
// 4. `consolidate` learnings prior to search/predict.
// 5. `search/predict` searches for the input text and returns the resultant
// document ids, sorted by their relevance along with the score. The number of
// results returned can be controlled via a limit argument that defaults to **10**.
// The last optional argument is a filter function that must return a `boolean`
// value, which is used to filter documents.
// 6. `exportJSON` exports the learnings in JSON format.
// 7. `importJSON` imports the learnings from JSON that may have been saved on disk.
// 8. `reset` all the learnings except the preparatory tasks.
var bm25fIMS = function () {
// Preparatory tasks that are executed on the `addDoc` & `search` input.
var pTasks = [];
// And its count.
var pTaskCount;
// Field level prep tasks.
var flds = Object.create( null );
// Returned!
var methods = Object.create( null );
// Term Frequencies & length of each document.
var documents = Object.create( null );
// Inverted Index for faster search
var invertedIdx = [];
// IDF for each tokens, tokens are referenced via their numerical index.
var idf = [];
// Set true on first call to `addDoc/learn` to prevent changing config.
var learned = false;
// The `addDoc()` and `search()/predict()` functions check this flag; it is
// set to `true` in `consolidate()`.
var consolidated = false;
// Total documents added.
var totalDocs = 0;
// Total number of tokens across all documents added.
var totalCorpusLength = 0;
// Their average.
var avgCorpusLength = 0;
// BM25F Configuration; set up in `defineConfig()`.
var config = null;
// The `token: index` mapping; `index` is used everywhere instead
// of the `token`
var token2Index = Object.create( null );
// Index's initial value, incremented with every new word.
var currTokenIndex = 0;
// ### Private functions
// #### Prepare Input
// Prepares the `input` by executing the pipeline of tasks defined in the
// `field` specific `pTasks` set via `definePrepTasks()`; each task receives
// the output of the previous one and the output of the last task is returned.
// If `field` is not specified, or no `field` specific `pTasks` are defined,
// then the default `pTasks` are used.
var prepareInput = function ( input, field ) {
  var fieldTasks = flds[ field ] && flds[ field ].pTasks;
  var tasks = fieldTasks || pTasks;
  var processedInput = input;
  // The count is always taken from the selected pipeline itself. Choosing the
  // count separately would pair an empty `field` pipeline (whose count of `0`
  // is falsy) with the default count and end up calling `undefined`.
  for ( var i = 0, imax = tasks.length; i < imax; i += 1 ) {
    processedInput = tasks[ i ]( processedInput );
  }
  return ( processedInput );
}; // prepareInput()
// #### Update Freq
// Pre-processes the `text` via `prepareInput()` and adds the `field's` `weight`
// to the `freq` entry of every resulting token. Tokens are keyed by their
// numerical index: a token seen for the first time is given the next index in
// `token2Index`. When a token is new to this `freq`, the `id` is also recorded
// against it in the inverted index. Returns the number of tokens multiplied by
// the absolute value of `weight`.
var updateFreq = function ( id, text, weight, freq, field ) {
  // Tokenized `text`.
  const tokens = prepareInput( text, field );
  const count = tokens.length;
  for ( let pos = 0; pos < count; pos += 1 ) {
    const token = tokens[ pos ];
    // Build `token: index` mapping.
    if ( token2Index[ token ] === undefined ) {
      token2Index[ token ] = currTokenIndex;
      currTokenIndex += 1;
    }
    const index = token2Index[ token ];
    // Already counted in this `freq`: only the weight needs to be added.
    if ( freq[ index ] !== undefined ) {
      freq[ index ] += weight;
      continue;
    }
    // First occurrence in this `freq`: start counting and index the `id`.
    freq[ index ] = weight;
    if ( !invertedIdx[ index ] ) invertedIdx[ index ] = [];
    invertedIdx[ index ].push( id );
  }
  // Length can not be negative!
  return ( count * Math.abs( weight ) );
}; // updateFreq()
// ### Exposed Functions
// #### Define Prep Tasks
// Defines the `tasks` required to prepare the input for `addDoc` and `search()`
// The `tasks` should be an array of functions; using these function a simple
// pipeline is built to serially transform the input to the output.
// It validates the `tasks` before updating the `pTasks`.
// If validation fails it throws an appropriate error.
// Tasks can be defined separately for each field. However if the field is not
// specified (i.e. `null` or `undefined`), then the `tasks` become default.
// Otherwise the `field` must be a string that is either a field having a weight
// in the config or the reserved name `'search'`.
// Note, `field = 'search'` is reserved for prep tasks for search string; However
// if the same is not specified, the default tasks are used for pre-processing.
// Returns the number of `tasks` defined.
var definePrepTasks = function ( tasks, field ) {
  if ( config === null ) {
    throw Error( 'winkBM25S: Config must be defined before defining prepTasks.' );
  }
  if ( !helpers.array.isArray( tasks ) ) {
    throw Error( 'winkBM25S: Tasks should be an array, instead found: ' + JSON.stringify( tasks ) );
  }
  for ( var i = 0, imax = tasks.length; i < imax; i += 1 ) {
    if ( typeof tasks[ i ] !== 'function' ) {
      throw Error( 'winkBM25S: Tasks should contain function, instead found: ' + ( typeof tasks[ i ] ) );
    }
  }
  // No field means that these tasks become the default pipeline.
  if ( field === undefined || field === null ) {
    pTasks = tasks;
    pTaskCount = tasks.length;
    return tasks.length;
  }
  // `search()` looks up its pipeline under the reserved name `'search'`, which
  // is normally not a weighted field; it has to be accepted explicitly or else
  // search specific tasks could never be registered.
  var isKnownField = ( typeof field === 'string' ) &&
    ( field === 'search' || Boolean( config.fldWeights[ field ] ) );
  if ( !isKnownField ) {
    throw Error( 'winkBM25S: Field name is missing or it is not a string: ' + JSON.stringify( field ) + '/' + ( typeof field ) );
  }
  flds[ field ] = flds[ field ] || Object.create( null );
  flds[ field ].pTasks = tasks;
  flds[ field ].pTaskCount = tasks.length;
  return tasks.length;
}; // definePrepTasks()
// #### Define Config
// Defines the configuration for BM25F using `fldWeights`, `bm25Params` and
// `ovFldNames` properties of `cfg` object. The `cfg` object itself is never
// modified, and the configuration is put in place only after the complete
// `cfg` has been validated; a call that throws leaves the earlier
// configuration, if any, untouched.</br>
// The `fldWeights` defines the weight for each field of the document. This gives
// a semantic nudge to search and are used as a multiplier to the count
// (frequency) of each token contained in that field of the document. It should
// be a JS object containing `field-name/value` pairs. If a field's weight is
// not defined, that field is **ignored**. The field weights must be defined before
// attempting to add a document via `addDoc()`; once addition of documents has
// started, the configuration can not be changed.
// </br>
// The `k`, `b` and `k1` properties of `bm25Params` object define the smoothing
// factor for IDF, degree of normalization for TF, and saturation control factor
// respectively for the BM25F. Their default values are **1**, **0.75**, and
// **1.2**; a missing, non-numeric or out of range value falls back to its
// default.<br/>
// The `ovFldNames` is an array of field names whose original value needs to
// be retained.<br/>
// Returns `true` on success and throws an error on invalid `cfg`.
var defineConfig = function ( cfg ) {
  if ( learned ) {
    throw Error( 'winkBM25S: config must be defined before learning/addition starts!' );
  }
  if ( !helpers.object.isObject( cfg ) ) {
    throw Error( 'winkBM25S: config must be a config object, instead found: ' + JSON.stringify( cfg ) );
  }
  // If `fldWeights` are absent throw error.
  if ( !helpers.object.isObject( cfg.fldWeights ) ) {
    throw Error( 'winkBM25S: fldWeights must be an object, instead found: ' + JSON.stringify( cfg.fldWeights ) );
  }
  // There should be at least one defined field!
  if ( ( helpers.object.keys( cfg.fldWeights ) ).length === 0 ) {
    throw Error( 'winkBM25S: Field config has no field defined.' );
  }

  // Returns `value` as a number if it is numeric and lies between `0` and
  // `max` (both inclusive); otherwise returns the `fallback`.
  var pickParam = function ( value, fallback, max ) {
    var invalid = ( value === null ) ||
      ( value === undefined ) ||
      ( isNaN( value ) ) ||
      ( +value < 0 || +value > max );
    return ( invalid ) ? fallback : +value;
  };

  // Build the configuration locally; the shared `config` is replaced only at
  // the very end so that a validation failure can not leave it half built.
  var newConfig = Object.create( null );
  // Field config for BM25**F**
  newConfig.fldWeights = Object.create( null );
  newConfig.bm25Params = Object.create( null );
  newConfig.ovFldNames = [];

  // Setup field weights.
  for ( var field in cfg.fldWeights ) {
    var weight = cfg.fldWeights[ field ];
    // The `null` check is required as `isNaN( null )` returns `false`!!
    // This first ensures non-`null/undefined/0` values before testing for NaN.
    // NOTE(review): negative weights pass this check although the message says
    // `>0`; `updateFreq()` and `consolidate()` appear to cater for negative
    // weights on purpose — confirm before tightening it.
    if ( !weight || isNaN( weight ) ) {
      throw Error( 'winkBM25S: Field weight should be number >0, instead found: ' + JSON.stringify( weight ) );
    }
    newConfig.fldWeights[ field ] = ( +weight );
  }

  // Setup BM25F params; an absent `bm25Params` means all defaults.
  var params = ( helpers.object.isObject( cfg.bm25Params ) ) ? cfg.bm25Params : Object.create( null );
  // **Controls TF part:**<br/>
  // `k1` controls saturation of token's frequency; higher value delays saturation
  // with increase in frequency.
  newConfig.bm25Params.k1 = pickParam( params.k1, 1.2, Infinity );
  // `b` controls the degree of normalization; **0** means no normalization and **1**
  // indicates complete normalization!
  newConfig.bm25Params.b = pickParam( params.b, 0.75, 1 );
  // **Controls IDF part:**<br/>
  // `k` controls impact of IDF; should be >= 0; a higher value means lower
  // the impact of IDF.
  newConfig.bm25Params.k = pickParam( params.k, 1, Infinity );

  // Handle configuration for fields whose original values has to be retained
  // in the document; an absent list means nothing is retained.
  var ovFldNames = ( cfg.ovFldNames ) ? cfg.ovFldNames : [];
  if ( !helpers.array.isArray( ovFldNames ) ) {
    throw Error( 'winkBM25S: OV Field names should be an array, instead found: ' + JSON.stringify( typeof ovFldNames ) );
  }
  ovFldNames.forEach( function ( f ) {
    if ( ( typeof f !== 'string' ) || ( f.length === 0 ) ) {
      throw Error( 'winkBM25S: OV Field name should be a non-empty string, instead found: ' + JSON.stringify( f ) );
    }
    newConfig.ovFldNames.push( f );
  } );

  // Everything is valid; commit.
  config = newConfig;
  return true;
}; // defineConfig()
// #### Add Doc
// Adds the document `doc`, identified by its unique `id`, to the model using
// `updateFreq()` function. Every field having a weight in the config and every
// field configured in `ovFldNames` must be present in the `doc`. All the checks
// are carried out before anything is updated, so a rejected document leaves no
// trace in the model. Returns the total number of documents added so far.
// Throws an error if the config is not defined, if consolidation has already
// been carried out, if the `id` is a duplicate or if a field is missing.
var addDoc = function ( doc, id ) {
  if ( config === null ) {
    throw Error( 'winkBM25S: Config must be defined before adding a document.' );
  }
  var fldWeights = config.fldWeights;
  // No point in adding/learning further in absence of consolidated.
  if ( consolidated ) {
    throw Error( 'winkBM25S: post consolidation adding/learning is not possible!' );
  }
  if ( documents[ id ] !== undefined ) {
    throw Error( 'winkBM25S: Duplicate document encountered: ' + JSON.stringify( id ) );
  }
  var field, length;
  // Validate upfront: throwing midway through indexing would leave a partially
  // indexed document behind along with an inflated corpus length.
  for ( field in fldWeights ) {
    if ( doc[ field ] === undefined ) {
      throw Error( 'winkBM25S: Missing field in the document: ' + JSON.stringify( field ) );
    }
  }
  config.ovFldNames.forEach( function ( f ) {
    if ( doc[ f ] === undefined ) {
      throw Error( 'winkBM25S: Missing field in the document: ' + JSON.stringify( f ) );
    }
  } );
  // Set learning/addition started.
  learned = true;
  var entry = Object.create( null );
  entry.freq = Object.create( null );
  entry.fieldValues = Object.create( null );
  entry.length = 0;
  documents[ id ] = entry;
  // Compute `freq` & `length` of the specified fields.
  for ( field in fldWeights ) {
    length = updateFreq( id, doc[ field ], fldWeights[ field ], entry.freq, field );
    entry.length += length;
    totalCorpusLength += length;
  }
  // Retain Original Field Values, if configured.
  config.ovFldNames.forEach( function ( f ) {
    entry.fieldValues[ f ] = doc[ f ];
  } );
  // Increment total documents indexed so far.
  totalDocs += 1;
  return ( totalDocs );
}; // addDoc()
// #### Consolidate
// Consolidates the learnings: computes the IDF of every token from the inverted
// index and the average document length, and then replaces every stored term
// frequency by its normalized & saturated value multiplied by the token's IDF.
// This must be done before using the `search` function and can be carried out
// only once; it also needs at least 3 documents. The `fp` defines the precision
// at which the resulting values are stored. It defaults to **4** on an invalid
// input; values below 4 are forced to 4 and values above 9 are forced to 9.
// Returns `true` on success.
var consolidate = function ( fp ) {
  if ( consolidated ) {
    throw Error( 'winkBM25S: consolidation can be carried out only once!' );
  }
  if ( totalDocs < 3 ) {
    throw Error( 'winkBM25S: document collection is too small for consolidation; add more docs!' );
  }
  // Keep the precision within 4 to 9.
  let freqPrecision = parseInt( fp, 10 );
  if ( isNaN( freqPrecision ) || freqPrecision < 4 ) {
    freqPrecision = 4;
  } else if ( freqPrecision > 9 ) {
    freqPrecision = 9;
  }
  // The commonly used BM25 names are cryptic and *short*; they are confined to
  // this function.
  const { b, k1, k } = config.bm25Params;
  // IDF of every token; the number of documents containing a token is the
  // length of its inverted index entry.
  for ( let index = 0, count = invertedIdx.length; index < count; index += 1 ) {
    const docCount = invertedIdx[ index ].length;
    const ratio = ( totalDocs - docCount + 0.5 ) / ( docCount + 0.5 );
    idf[ index ] = Math.log( ratio + k );
  }
  avgCorpusLength = totalCorpusLength / totalDocs;
  // Fold the IDF into the stored frequencies so that search does not have to
  // multiply.
  for ( const id of Object.keys( documents ) ) {
    const doc = documents[ id ];
    const normalizationFactor = ( 1 - b ) + ( b * ( doc.length / avgCorpusLength ) );
    for ( const t of Object.keys( doc.freq ) ) {
      const raw = doc.freq[ t ];
      // The magnitude of `k1` can jeopardize the sign; therefore work with the
      // absolute value and put the original sign back.
      const saturated = Math.abs( ( raw * ( k1 + 1 ) ) / ( ( k1 * normalizationFactor ) + raw ) );
      doc.freq[ t ] = Math.sign( raw ) * ( saturated * idf[ t ] ).toFixed( freqPrecision );
    }
  }
  // Set `consolidated` as `true`.
  consolidated = true;
  return true;
}; // consolidate()
// #### Search
// Searches the `text` and returns at most `limit` results; if `limit` is not
// specified then a maximum of **10** results are returned, and at least **1**
// slot is always allowed. The `text` must be a string and it is prepared using
// the `search` specific prep tasks. The score of a document is the sum of its
// stored values for every known token of the prepared `text`. The scores are
// turned into a table by `helpers.object.table()` (rows in `[ id, score ]`
// format), sorted using `helpers.array.descendingOnValue` and sliced. If
// nothing is found, the table is empty. The argument `filter` is like `filter`
// of JS Array; it receives an object containing document's retained field
// name/value pairs along with the `params` (which is passed as the second
// argument), and only the documents for which it returns a truthy value are
// scored. It is useful in limiting the search space or making the search more
// focused. Throws an error if the learnings are not yet consolidated or if the
// `text` is not a string.
var search = function ( text, limit, filter, params ) {
  // Predict/Search only if learnings have been consolidated!
  if ( !consolidated ) {
    throw Error( 'winkBM25S: search is not possible unless learnings are consolidated!' );
  }
  if ( typeof text !== 'string' ) {
    throw Error( 'winkBM25S: search text should be a string, instead found: ' + ( typeof text ) );
  }
  // Without a filter function every document is acceptable.
  const accept = ( typeof filter === 'function' ) ? filter : function () {
    return true;
  };
  // Indexes of the tokens of the prepared `text`; tokens that do not exist in
  // the vocabulary are dropped.
  const tokenIndexes = prepareInput( text, 'search' ).reduce( function ( known, token ) {
    const index = token2Index[ token ];
    if ( index !== undefined ) known.push( index );
    return known;
  }, [] );
  // Scores go here as doc id/score pairs.
  const scores = Object.create( null );
  tokenIndexes.forEach( function ( index ) {
    // The inverted index entry always exists here, as unknown tokens have
    // already been dropped.
    invertedIdx[ index ].forEach( function ( id ) {
      const doc = documents[ id ];
      if ( !accept( doc.fieldValues, params ) ) return;
      scores[ id ] = doc.freq[ index ] + ( scores[ id ] || 0 );
    } );
  } );
  // Convert to a table; sort and slice required number of resultant documents.
  const size = Math.max( ( limit || 10 ), 1 );
  return helpers.object.table( scores )
    .sort( helpers.array.descendingOnValue )
    .slice( 0, size );
}; // search()
// #### Reset
// Re-initializes every variable that is associated with learning and returns
// `true`. The preparatory tasks (`pTasks` & `pTaskCount`) are not touched.
var reset = function () {
  // Learning state flags: `learned` is set on the first call to
  // `addDoc/learn` to prevent changing config, and `consolidated` is set in
  // `consolidate()` and checked by `search()`.
  learned = false;
  consolidated = false;
  // Corpus statistics: number of documents added, total number of tokens
  // across all of them and their average.
  totalDocs = 0;
  totalCorpusLength = 0;
  avgCorpusLength = 0;
  // Term frequencies & length of each document.
  documents = Object.create( null );
  // Inverted index for faster search, and IDF for each token.
  invertedIdx = [];
  idf = [];
  // The `token: index` mapping along with the index's initial value, which is
  // incremented with every new word; `index` is used everywhere instead of
  // the `token`.
  token2Index = Object.create( null );
  currTokenIndex = 0;
  // Configuration; set up in `defineConfig()`.
  config = null;
  return true;
}; // reset()
// #### Export JSON
// Serializes the learnings, along with the `consolidated` flag, and returns
// them as a JSON string. The payload is an array whose element order is what
// `importJSON()` validates and reads back.
var exportJSON = function ( ) {
  // Corpus level statistics travel together in a single object.
  var docStats = {
    totalCorpusLength: totalCorpusLength,
    totalDocs: totalDocs,
    consolidated: consolidated
  };
  var payload = [
    config,
    docStats,
    documents,
    invertedIdx,
    currTokenIndex,
    token2Index,
    // The last three are placeholders for future expansion; the import will
    // have to set default values for them and still ensure nothing breaks.
    {},
    [],
    []
  ];
  return JSON.stringify( payload );
}; // exportJSON()
// #### Import JSON
// Imports the `json` (a string in the format produced by `exportJSON()`) into
// the index after validating its format. Throws an error if `json` is
// missing/empty or fails validation (text that is not JSON at all surfaces as
// the error thrown by `JSON.parse()`); on success it returns `true`. Note,
// importing resets the search engine first, so earlier learnings are lost.
var importJSON = function ( json ) {
  if ( !json ) {
    throw Error( 'winkBM25S: undefined or null JSON encountered, import failed!' );
  }
  // Validators for the JSON format: the function at position `i` checks the
  // element at position `i` of the parsed array.
  var isOK = [
    helpers.object.isObject,
    helpers.object.isObject,
    helpers.object.isObject,
    helpers.array.isArray,
    Number.isInteger,
    helpers.object.isObject,
    helpers.object.isObject,
    helpers.array.isArray,
    helpers.array.isArray
  ];
  var parsedJSON = JSON.parse( json );
  if ( !helpers.array.isArray( parsedJSON ) || parsedJSON.length !== isOK.length ) {
    throw Error( 'winkBM25S: invalid JSON encountered, can not import.' );
  }
  for ( var i = 0; i < isOK.length; i += 1 ) {
    if ( !isOK[ i ]( parsedJSON[ i ] ) ) {
      throw Error( 'winkBM25S: invalid JSON encountered, can not import.' );
    }
  }
  // All good, setup variable values.
  // First reset everything; this happens only after validation so that a bad
  // import does not wipe the existing learnings.
  reset();
  // To prevent config change.
  learned = true;
  // Load variable values.
  config = parsedJSON[ 0 ];
  totalCorpusLength = parsedJSON[ 1 ].totalCorpusLength;
  totalDocs = parsedJSON[ 1 ].totalDocs;
  consolidated = parsedJSON[ 1 ].consolidated;
  // `JSON.parse()` produces objects that inherit from `Object.prototype`, but
  // `documents` and `token2Index` are used as dictionaries and are created
  // prototype-less by `reset()`. With an inherited prototype, a lookup such as
  // `token2Index[ 'constructor' ] !== undefined` in `search()` would wrongly
  // succeed and crash while reading the inverted index. Copy the own
  // properties into prototype-less containers to keep lookups safe.
  documents = Object.assign( Object.create( null ), parsedJSON[ 2 ] );
  invertedIdx = parsedJSON[ 3 ];
  currTokenIndex = parsedJSON[ 4 ];
  token2Index = Object.assign( Object.create( null ), parsedJSON[ 5 ] );
  // Return success.
  return true;
}; // importJSON()
// Attach the functions defined above to the `methods` object, under the names
// by which they are called.
methods.definePrepTasks = definePrepTasks;
methods.defineConfig = defineConfig;
methods.addDoc = addDoc;
methods.consolidate = consolidate;
methods.search = search;
methods.exportJSON = exportJSON;
methods.importJSON = importJSON;
methods.reset = reset;
// Aliases to keep the APIs uniform: `learn` is the same function as `addDoc`
// and `predict` is the same function as `search`.
methods.learn = addDoc;
methods.predict = search;
return ( methods );
}; // bm25fIMS()
module.exports = bm25fIMS;
},{"wink-helpers":5}],4:[function(require,module,exports){
// wink-distance
// Distance functions for Bag of Words, Strings,
// Vectors and more.
//
// Copyright (C) 2017-18 GRAYPE Systems Private Limited
//
// This file is part of “wink-distance”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Soundex code for letters of the alphabet. Each group of letters below shares
// one code, which is the group's position in the list: vowels and `Y` are `0`,
// consonants are `1` to `6`. The letters `H` and `W` have no entry.
/* eslint-disable object-property-newline */
var soundexMap = {};
[ 'AEIOUY', 'BFPV', 'CGJKQSXZ', 'DT', 'L', 'MN', 'R' ].forEach( function ( letters, code ) {
  letters.split( '' ).forEach( function ( letter ) {
    soundexMap[ letter ] = code;
  } );
} );
// ## string
// ### soundex
/**
 *
 * Computes the soundex code of the input `word`: its first character (upper
 * cased) followed by the codes of the subsequent consonants, where adjacent
 * consonants sharing a code — even if separated by `h`, `w` or an unknown
 * character — are coded only once. An empty `word` is treated as `'?'`.
 * Codes shorter than 4 characters are padded with `'000'` before being cut
 * to `maxLength`.
 *
 * @private
 * @param {string} word the input word.
 * @param {number} [maxLength=4] of soundex code to be returned.
 * @return {string} soundex code of `word`.
 * @example
 * soundex( 'Burroughs' );
 * // -> 'B620'
 * soundex( 'Burrows' );
 * // -> 'B620'
 */
var soundex = function ( word, maxLength ) {
  // Work on the upper cased word throughout.
  var letters = ( word.length ) ? word.toUpperCase() : '?';
  // Use default of 4.
  var wanted = maxLength || 4;
  // The first letter is always retained as is.
  var pieces = [ letters[ 0 ] ];
  // Code of the last letter that matters for detecting adjacent duplicates.
  var lastCode = soundexMap[ letters[ 0 ] ];
  var thisCode;
  for ( var pos = 1; pos < letters.length; pos += 1 ) {
    thisCode = soundexMap[ letters[ pos ] ];
    if ( thisCode === 0 ) {
      // A vowel: not coded, but it separates consonants with the same code.
      lastCode = 0;
    } else if ( thisCode !== undefined && thisCode !== lastCode ) {
      // Consonant that is not an adjacent duplicate.
      pieces.push( thisCode );
      lastCode = thisCode;
    }
    // Otherwise it is `h or w`, an unknown character or an adjacent
    // duplicate: `lastCode` stays remembered so duplicates can be handled.
  }
  var code = pieces.join( '' );
  // Ensure a minimum length of 4 characters.
  if ( code.length < 4 ) code += '000';
  // Return the required length.
  return code.substring( 0, wanted );
}; // soundex()
module.exports = soundex;
},{}],5:[function(require,module,exports){
// wink-helpers
// Functions for cross validation, shuffle, cartesian product and more
//
// Copyright (C) 2017-18 GRAYPE Systems Private Limited
//
// This file is part of “wink-helpers”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Container to which the helper namespaces (`array`, `object`) are attached
// below; created without a prototype, so it has no inherited properties.
var helpers = Object.create( null );
// ### Private Functions
// #### Product Reducer (Callback)
// Callback passed to `reduce` by the `product()` function. Given the
// combinations built so far (`prev`) and the next array (`curr`), it returns
// a new array in which every combination is extended by every element of
// `curr`, keeping the order of `prev` first and of `curr` second.
var productReducer = function ( prev, curr ) {
  var combos = [];
  var prevCount = prev.length;
  var currCount = curr.length;
  var pi = 0;
  var ci;
  while ( pi < prevCount ) {
    ci = 0;
    while ( ci < currCount ) {
      combos.push( prev[ pi ].concat( curr[ ci ] ) );
      ci += 1;
    }
    pi += 1;
  }
  return combos;
}; // productReducer()
// ### Public Functions
// ### Array Helpers
// Namespace for the array helpers attached below; created without a prototype.
helpers.array = Object.create( null);
// #### is Array
// Tests if argument `v` is a JS array; returns `true` if it is, otherwise
// returns `false` (including for `undefined` and `null`). It relies on the
// built-in `Array.isArray()` instead of comparing the `Object.prototype.toString`
// tag, because that tag can be overridden via `Symbol.toStringTag` and would
// let a non-array object pass the test.
helpers.array.isArray = function ( v ) {
  return Array.isArray( v );
}; // isArray()
// #### sorting helpers
// Set of helpers to sort either numbers or strings. For key/value pairs,
// the format for each element must be `[ key, value ]`.
// Compare function for `sort()` that arranges an array in ascending order:
// returns `1` if `a` is greater than `b`, `0` if they are strictly equal and
// `-1` otherwise.
helpers.array.ascending = function ( a, b ) {
  if ( a > b ) return 1;
  if ( a === b ) return 0;
  return -1;
}; // ascending()
// Compare function for `sort()` that arranges an array in descending order:
// returns `1` if `b` is greater than `a`, `0` if they are strictly equal and
// `-1` otherwise.
helpers.array.descending = function ( a, b ) {
  if ( b > a ) return 1;
  if ( b === a ) return 0;
  return -1;
}; // descending()
// Compare function for `sort()` that arranges an array of `[ key, value ]`
// pairs in ascending order of the **key**, i.e. the element at index `0`.
helpers.array.ascendingOnKey = function ( a, b ) {
  if ( a[ 0 ] > b[ 0 ] ) return 1;
  if ( a[ 0 ] === b[ 0 ] ) return 0;
  return -1;
}; // ascendingOnKey()
// Compare function for `sort()` that arranges an array of `[ key, value ]`
// pairs in descending order of the **key**, i.e. the element at index `0`.
helpers.array.descendingOnKey = function ( a, b ) {
  if ( b[ 0 ] > a[ 0 ] ) return 1;
  if ( b[ 0 ] === a[ 0 ] ) return 0;
  return -1;
}; // descendingOnKey()
// Compare function for `sort()` that arranges an array of `[ key, value ]`
// pairs in ascending order of the **value**, i.e. the element at index `1`.
helpers.array.ascendingOnValue = function ( a, b ) {
  if ( a[ 1 ] > b[ 1 ] ) return 1;
  if ( a[ 1 ] === b[ 1 ] ) return 0;
  return -1;
}; // ascendingOnValue()
// Compare function for `sort()` that arranges an array of `[ key, value ]`
// pairs in descending order of the **value**, i.e. the element at index `1`.
helpers.array.descendingOnValue = function ( a, b ) {
  if ( b[ 1 ] > a[ 1 ] ) return 1;
  if ( b[ 1 ] === a[ 1 ] ) return 0;
  return -1;
}; // descendingOnValue()
// The following two functions generate a suitable function for sorting on a single
// key or on a composite key (max 2 accessors only). Just a reminder, the generated
// function does not sort on two keys; instead it will sort on a key composed of the
// two accessors.
// Returns a compare function for `sort()` that arranges elements in ascending
// order of `element[ accessor1 ]` or, when `accessor2` is supplied, of
// `element[ accessor1 ][ accessor2 ]`.
helpers.array.ascendingOn = function ( accessor1, accessor2 ) {
  // Reads the value to compare on from an element.
  var pick = ( accessor2 ) ?
    function ( item ) {
      return item[ accessor1 ][ accessor2 ];
    } :
    function ( item ) {
      return item[ accessor1 ];
    };
  return ( function ( a, b ) {
    var left = pick( a );
    var right = pick( b );
    if ( left > right ) return 1;
    return ( left === right ) ? 0 : -1;
  } );
}; // ascendingOn()
// Returns a compare function for `sort()` that arranges elements in descending
// order of `element[ accessor1 ]` or, when `accessor2` is supplied, of
// `element[ accessor1 ][ accessor2 ]`.
helpers.array.descendingOn = function ( accessor1, accessor2 ) {
  // Reads the value to compare on from an element.
  var pick = ( accessor2 ) ?
    function ( item ) {
      return item[ accessor1 ][ accessor2 ];
    } :
    function ( item ) {
      return item[ accessor1 ];
    };
  return ( function ( a, b ) {
    var right = pick( b );
    var left = pick( a );
    if ( right > left ) return 1;
    return ( right === left ) ? 0 : -1;
  } );
}; // descendingOn()
// #### pluck
// Plucks the specified element from each element of an **array of arrays**,
// and returns the resultant array. The element to pluck is specified by `key`
// (default `0`) and the number of leading elements of `a` to pluck from is
// defined by `limit` (default `a.length`, which is also its upper bound).
helpers.array.pluck = function ( a, key, limit ) {
  var column = key || 0;
  var available = a.length;
  // A missing `limit`, or one beyond the end of `a`, means everything.
  var count = limit || available;
  if ( count > available ) count = available;
  var plucked = new Array( count );
  var row = 0;
  while ( row < count ) {
    plucked[ row ] = a[ row ][ column ];
    row += 1;
  }
  return plucked;
}; // pluck()
// #### product
// Finds the Cartesian Product of the arrays present inside the array `a`;
// therefore `a` must be an array of 1-dimensional arrays. For example,
// `product( [ [ 9, 8 ], [ 1, 2 ] ] )`
// will produce `[ [ 9, 1 ], [ 9, 2 ], [ 8, 1 ], [ 8, 2 ] ]`.
helpers.array.product = function ( a ) {
  // Start from a single empty combination and let `productReducer` extend it
  // with every array of `a` in turn.
  var seed = [ [] ];
  return a.reduce( productReducer, seed );
};
// #### shuffle
// Randomly shuffles the elements of `array` in place and returns the same
// array.
// Reference: Chapter on Random Numbers/Shuffling in Seminumerical Algorithms,
// The Art of Computer Programming Volume II by Donald E Knuth.
helpers.array.shuffle = function ( array ) {
  var remaining, swapIdx, held;
  // Walk from the end: pick a random element among those not yet placed and
  // swap it in to the last unplaced position.
  for ( remaining = array.length; remaining; ) {
    swapIdx = Math.floor( Math.random() * remaining );
    remaining -= 1;
    held = array[ remaining ];
    array[ remaining ] = array[ swapIdx ];
    array[ swapIdx ] = held;
  }
  return array;
};
// ### Object Helpers
// Local aliases of built-ins: `objectKeys` backs the `keys()` & `size()`
// helpers below; `objectCreate` is not referenced in the code that follows
// immediately — NOTE(review): confirm it is used further down the file.
var objectKeys = Object.keys;
var objectCreate = Object.create;
// Namespace for the object helpers attached below; created without a prototype.
helpers.object = Object.create( null );
// #### is Object
// Tests if argument `v` is a JS object, i.e. it is truthy and its
// `Object.prototype.toString` tag is `[object Object]`; returns `true` if it
// is, otherwise returns `false`.
helpers.object.isObject = function ( v ) {
  return ( Boolean( v ) && Object.prototype.toString.call( v ) === '[object Object]' );
}; // isObject()
// #### keys
// Returns an array containing the keys of the `obj`, as given by `Object.keys`.
helpers.object.keys = function ( obj ) {
  var names = objectKeys( obj );
  return names;
}; // keys()
// #### size
// Returns the number of keys of the `obj`, as counted by `Object.keys`.
helpers.object.size = function ( obj ) {
  var names = objectKeys( obj );
  return names.length;
}; // size()
// #### values
// Returns all values from each key/value pair of the `obj` in an array.
helpers.object.values = function ( obj ) {
var keys = helpers.object.keys( obj );
var length = keys.length;
var values = new Array( length );
for ( var i = 0; i < length; i += 1 ) {
values[ i ] = obj[ keys[ i ] ];
}
return values;