@@ -63,19 +63,23 @@ static public function getFreq(array $ngrams, string $sep = ' ') : array
6363 //creates an array of tokens per ngram
6464 $ ngramsArray = self ::ngramsAsArray ($ sep , $ ngrams );
6565
66+ $ ngramSize = count ($ ngramsArray [0 ]);
67+
68+ $ tokens_frequencies = self ::readFreq ($ sep , $ ngrams );
69+ $ combo_frequencies = self ::readCombFreq ($ sep , $ ngrams );
70+
6671 //iterate over the array with no repeated ngrams
6772 foreach ($ ngramsUnique as $ ngramString => $ ngramFrequency ) {
6873 $ ngramsFinal [$ ngramString ] = array ($ ngramFrequency ); //putting into the final array an array of frequencies (first, the ngram frequency)
6974
7075 $ ngramArray = explode ($ sep , $ ngramString ); //getting an array of tokens of the ngram
71- $ ngramSize = count ($ ngramArray ); //getting the size of ngram
7276 foreach ($ ngramArray as $ kToken => $ token ) { //iterating the array of tokens of the ngram
73- $ ngramsFinal [$ ngramString ][$ kToken +1 ] = self :: countFreq ( $ ngramsArray , $ token, $ kToken) ; //getting the frequency of the token
77+ $ ngramsFinal [$ ngramString ][$ kToken +1 ] = $ tokens_frequencies [ $ token][ $ kToken] ; //getting the frequency of the token
7478
7579 if ($ ngramSize > 2 ) {
7680 //getting the combined frequency of the tokens
7781 for ($ i = $ kToken +1 ; $ i < $ ngramSize ; $ i ++) {
78- $ ngramsFinal [$ ngramString ][$ ngramSize +$ kToken +$ i ] = self :: countFreq ( $ ngramsArray , $ token, $ kToken , $ ngramArray [$ i ], $ i ) ;
82+ $ ngramsFinal [$ ngramString ][$ ngramSize +$ kToken +$ i ] = $ combo_frequencies [ $ token. $ sep . $ ngramArray [$ i ]][ $ kToken ][ $ i ] ;
7983 }
8084 }
8185 }
@@ -86,32 +90,50 @@ static public function getFreq(array $ngrams, string $sep = ' ') : array
8690 }
8791
8892 /**
89- * Count the number of times the given string(s) to the given position(s) occurs in the given ngrams array.
90- * @param array $ngramsArray
91- * @param string $str1
92- * @param int $pos1
93- * @param string $str2
94- * @param int $pos2
95- * @return int $count return the frequency
93+ * Counts, for every token, how many ngrams contain that token at each position
94+ * @param string $sep Passed to ngramsAsArray() together with $ngrams (presumably the token separator; confirm against ngramsAsArray)
95+ * @param array $ngrams Ngrams to count; converted with ngramsAsArray() before counting
96+ * @return array $frequencies Nested array indexed as [token][position] => number of ngrams having that token at that position
9697 */
97- static private function countFreq ( array $ ngramsArray , string $ str1 , int $ pos1 , string $ str2 = null , int $ pos2 = null ) : int
98+ static public function readFreq ( string $ sep , array $ ngrams ) : array
9899 {
99- $ count = 0 ;
100-
101- //counts the number of times the given string(s) to the given position(s) occurs in the given ngrams array.
102- foreach ($ ngramsArray as $ ngramArray ) {
103- if ($ str1 === $ ngramArray [$ pos1 ]) {
104- if (isset ($ str2 ) && isset ($ pos2 )) {
105- if ($ str2 === $ ngramArray [$ pos2 ]) {
106- $ count ++;
107- }
100+ $ ngrams = self ::ngramsAsArray ($ sep , $ ngrams );
101+ $ frequencies = array ();
102+ foreach ($ ngrams as $ ngram ) {
103+ foreach ($ ngram as $ pos => $ token ) {
104+ if (isset ($ frequencies [$ token ][$ pos ])) { //this token was already seen at this position: increment its count
105+ $ frequencies [$ token ][$ pos ] += 1 ;
108106 } else {
109- $ count ++ ;
107+ $ frequencies [ $ token ][ $ pos ] = 1 ; //first occurrence of this token at this position
110108 }
111109 }
112110 }
111+ return $ frequencies ;
112+ }
113+
114+ /**
115+ * Counts, for every pair of tokens taken in order from the same ngram, how many ngrams contain that pair at the same two positions
116+ * @param string $sep Passed to ngramsAsArray() and also used to join the two tokens into the result key
117+ * @param array $ngrams Ngrams to count; converted with ngramsAsArray() before counting
118+ * @return array $frequencies Nested array indexed as [token1 . $sep . token2][position of token1][position of token2] => number of ngrams containing that pair at those positions
119+ */
120+ static public function readCombFreq (string $ sep , array $ ngrams ) : array
121+ {
122+ $ ngrams = self ::ngramsAsArray ($ sep , $ ngrams );
123+ $ frequencies = array ();
124+ foreach ($ ngrams as $ ngram ) {
125+ foreach ($ ngram as $ posToken => $ token ) {
126+ for ($ i = $ posToken +1 ; $ i < count ($ ngram ); $ i ++) { //pairs the token only with tokens at later positions
127+ if (isset ($ frequencies [$ token .$ sep .$ ngram [$ i ]][$ posToken ][$ i ])) { //this pair was already seen at these two positions: increment its count
128+ $ frequencies [$ token .$ sep .$ ngram [$ i ]][$ posToken ][$ i ] += 1 ;
129+ } else {
130+ $ frequencies [$ token .$ sep .$ ngram [$ i ]][$ posToken ][$ i ] = 1 ; //first occurrence of this pair at these two positions
131+ }
132+ }
133+ }
113134
114- return $ count ;
135+ }
136+ return $ frequencies ;
115137 }
116138
117139 /**
0 commit comments