99 % case-insensitive BERTTokenizer using the file vocabFile as
1010 % the vocabulary.
1111 %
12- % tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
13- % Constructs a BERTTokenizer which is case-sensitive or not
14- % according to the scalar logical tf. The default is true.
12+ % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
13+ % specifies the optional parameter name/value pairs:
14+ %
15+ % 'IgnoreCase' - A scalar logical. When true, the
16+ % BERTTokenizer is case-insensitive.
17+ % The default value is true.
18+ %
19+ % 'FullTokenizer' - The underlying word-piece tokenizer, a
20+ % bert.tokenizer.internal.FullTokenizer. If not specified,
21+ % one is constructed from vocabFile and 'IgnoreCase'.
1522 %
1623 % BERTTokenizer properties:
1724 % FullTokenizer - The underlying word-piece tokenizer.
3441 % tokenizer = bert.tokenizer.BERTTokenizer();
3542 % sequences = tokenizer.encode("Hello World!")
3643
37- % Copyright 2021 The MathWorks, Inc.
44+ % Copyright 2021-2023 The MathWorks, Inc.
3845
3946 properties (Constant )
4047 PaddingToken = " [PAD]"
6370 % case-insensitive BERTTokenizer using the file vocabFile as
6471 % the vocabulary.
6572 %
66- % tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
67- % Constructs a BERTTokenizer which is case-sensitive or not
68- % according to the scalar logical tf. The default is true.
73+ % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
74+ % specifies the optional parameter name/value pairs:
75+ %
76+ % 'IgnoreCase' - A scalar logical. When true, the
77+ % BERTTokenizer is case-insensitive.
78+ % The default value is true.
79+ %
80+ % 'FullTokenizer' - The underlying word-piece tokenizer, a
81+ % bert.tokenizer.internal.FullTokenizer. If not specified,
82+ % one is constructed from vocabFile and 'IgnoreCase'.
6983 %
7084 % BERTTokenizer properties:
7185 % FullTokenizer - The underlying word-piece tokenizer.
90104 arguments
91105 vocabFile (1 ,1 ) string {mustBeFile } = bert.internal.getSupportFilePath(" base" ," vocab.txt" )
92106 nvp.IgnoreCase (1 ,1 ) logical = true
107+ nvp.FullTokenizer = []
108+ end
109+ if isempty(nvp .FullTokenizer )
110+ ignoreCase = nvp .IgnoreCase ;
111+ this.FullTokenizer = bert .tokenizer .internal .FullTokenizer(vocabFile ,' IgnoreCase' ,ignoreCase );
112+ else
113+ mustBeA(nvp .FullTokenizer ,' bert.tokenizer.internal.FullTokenizer' );
114+ this.FullTokenizer = nvp .FullTokenizer ;
93115 end
94- ignoreCase = nvp .IgnoreCase ;
95- this.FullTokenizer = bert .tokenizer .internal .FullTokenizer(vocabFile ,' IgnoreCase' ,ignoreCase );
96116 this.PaddingCode = this .FullTokenizer .encode(this .PaddingToken );
97117 this.SeparatorCode = this .FullTokenizer .encode(this .SeparatorToken );
98118 this.StartCode = this .FullTokenizer .encode(this .StartToken );
131151 inputShape = size(text_a );
132152 text_a = reshape(text_a ,[],1 );
133153 text_b = reshape(text_b ,[],1 );
134- tokenize = @(text ) this .FullTokenizer .tokenize(text );
135- tokens = arrayfun(tokenize ,text_a ,' UniformOutput' ,false );
154+ tokens = this .FullTokenizer .tokenize(text_a );
136155 if ~isempty(text_b )
137- tokens_b = arrayfun( tokenize , text_b , ' UniformOutput ' , false );
156+ tokens_b = this . FullTokenizer . tokenize( text_b );
138157 tokens = cellfun(@(tokens_a ,tokens_b ) [tokens_a ,this .SeparatorToken ,tokens_b ], tokens , tokens_b , ' UniformOutput' , false );
139158 end
140159 tokens = cellfun(@(tokens ) [this .StartToken , tokens , this .SeparatorToken ], tokens , ' UniformOutput' , false );
218237 text = cellfun(@(x ) join(x ," " ), tokens );
219238 end
220239 end
221- end
240+ end
0 commit comments