forked from eulerto/pg_similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.h
48 lines (39 loc) · 1.1 KB
/
tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*----------------------------------------------------------------------------
*
* tokenizer.h
*
* Copyright (c) 2008-2020, Euler Taveira de Oliveira
*
*----------------------------------------------------------------------------
*/
/*
 * Include guard.  This header defines the Token and TokenList struct types,
 * so pulling it into one translation unit twice (directly or through
 * another header) would otherwise fail with a struct redefinition error.
 */
#ifndef PGS_TOKENIZER_H
#define PGS_TOKENIZER_H

#include "postgres.h"

#include <ctype.h>
#include <string.h>
#include <stdlib.h>

/*
 * Tokenizer constants.  Their users are not in this header, so the
 * descriptions below are inferred from the names only -- confirm against
 * the tokenizer implementation.
 */

/* presumably an upper bound on the length of a single token */
#define PGS_MAX_TOKEN_LEN 1024

/* presumably the n-gram size used by tokenizeByGram (3 would be trigrams) */
#define PGS_GRAM_LEN 3

/* a single space character; presumably used as n-gram padding -- confirm */
#define PGS_BLANK_CHAR ' '

/*
 * Defined with an empty value, i.e. a compile-time switch.  Presumably
 * tested with #ifdef to select "full" n-gram generation -- confirm.
 */
#define PGS_FULL_NGRAM

/*
 * Token
 *
 * One node of a singly linked chain of tokens: it carries a string, a
 * counter and a pointer to the following node.
 */
typedef struct Token
{
	char *data; /* token data (who allocates/frees it is not visible here) */
	int freq; /* frequency (presumably occurrences of this token -- confirm) */
	struct Token *next; /* next token */
} Token;

/*
 * TokenList
 *
 * List header that keeps pointers to both ends of a Token chain together
 * with its size and a flag telling whether the list is meant to be a set.
 */
typedef struct TokenList
{
	int isset; /* is a set? (presumably nonzero means no duplicates -- confirm) */
	int size; /* list size */
	Token *head; /* first token */
	Token *tail; /* last token */
} TokenList;

/*
 * List management.
 *
 * The definitions live outside this header.  The meaning of the int return
 * values of addToken/removeToken and the memory-ownership rules for the
 * strings passed in are not visible here -- check the implementation before
 * relying on them.
 */
TokenList *initTokenList(int isset);
void destroyTokenList(TokenList *t);
int addToken(TokenList *t, char *s);
int removeToken(TokenList *t);
Token *searchToken(TokenList *t, char *s);
void printToken(TokenList *t);

/*
 * Tokenizers.
 *
 * Each one takes a token list and an input string and returns nothing;
 * presumably the tokens extracted from s are stored into t, split on the
 * criterion named by the function (non-alphanumeric characters, spaces,
 * n-grams, camel-case boundaries) -- confirm against the implementation.
 */
void tokenizeByNonAlnum(TokenList *t, char *s);
void tokenizeBySpace(TokenList *t, char *s);
void tokenizeByGram(TokenList *t, char *s);
void tokenizeByCamelCase(TokenList *t, char *s);

#endif /* PGS_TOKENIZER_H */