Skip to content

Commit

Permalink
fixed utf encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
Yoz75 committed Nov 17, 2024
1 parent b8830cd commit 1f35f79
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 37 deletions.
58 changes: 34 additions & 24 deletions generator/randomtexttokenizer.d
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import std.random;

class RandomTextTokenizer : ITextTokenizer
{
private size_t MinLength, MaxLength;
private const size_t MinLength, MaxLength;

public this(size_t minLength, size_t maxLength)
{
Expand All @@ -21,41 +21,51 @@ class RandomTextTokenizer : ITextTokenizer

/**
 * Splits `input` into consecutive tokens of random length and links every
 * token to the token that directly follows it in the text.
 *
 * Each length is drawn uniformly from the inclusive range
 * [MinLength, MaxLength]. Equal substrings share a single Token instance.
 * Splitting stops as soon as the drawn length no longer fits into the
 * remaining text, so a trailing remainder may be left untokenized.
 *
 * Params:
 *   input          = text to split.
 *   tokenValueSize = not used; token lengths are random
 *                    (presumably kept to satisfy ITextTokenizer - confirm).
 *
 * Returns: the tokens in order of appearance; a repeated substring shows up
 *          as the same Token instance each time.
 */
public Token[] Tokenize(WGString input, size_t tokenValueSize)
{
    Token[WGString] tokensDict;
    Token[] allTokens;

    // The previous token is remembered directly. Re-slicing the input with
    // the *current* random length does not reproduce the previous token's
    // text (its length was drawn independently), which made the dictionary
    // lookup fail with a RangeError.
    Token prevToken = null;

    size_t i = 0;
    while (i < input.length)
    {
        // Draw a random token length in [MinLength, MaxLength].
        size_t size = uniform(MinLength, MaxLength + 1);

        // A zero-length token would never advance `i` and loop forever.
        if (size == 0)
            size = 1;

        // Stop when the token does not fit into the rest of the input.
        if (i + size > input.length)
            break;

        WGString tokenValue = input[i .. i + size];

        // Reuse the Token already created for this text, if any.
        Token token;
        if (auto known = tokenValue in tokensDict)
        {
            token = *known;
        }
        else
        {
            token = new Token(tokenValue);
            tokensDict[tokenValue] = token;
        }

        allTokens ~= token;

        if (prevToken !is null)
        {
            prevToken.AddNextToken(token);
        }
        prevToken = token;

        // Move on to the next token.
        i += size;
    }

    if (allTokens.length > 0)
    {
        // NOTE(review): equal substrings share one Token, so this also drops
        // the successors recorded for earlier occurrences of the last
        // token's text - confirm this is intended.
        allTokens[$ - 1].NextTokens = null;
    }

    return allTokens;
}
}
2 changes: 1 addition & 1 deletion generator/wgstring.d
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module generator.wgstring;

alias WGString = string;
alias WGString = dstring;
22 changes: 10 additions & 12 deletions main.d
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@

import std.stdio;
import std.algorithm;
import std.file : read, getcwd;
import std.file : readText, getcwd;
import std.conv : to;
import core.sys.windows.windows;
import generator.itexttokenizer;
import generator.texttokenizer;
import generator.randomtexttokenizer;
Expand All @@ -11,17 +12,12 @@ import generator.token;
import generator.textgenerator;
import generator.wgstring;

extern(Windows)
{
bool SetConsoleOutputCP(uint wCodePageID);
}

final abstract class Arguments
{
static:
public immutable string ReadFileName = "source=";
public immutable string TokenSize = "ts=";
public immutable string TokenRandomSizes = "trs=";
//public immutable string TokenRandomSizes = "trs=";
public immutable string TokensGenerate = "tg=";
public immutable string TokensNext = "tn=";
public immutable string TokensRandomChance = "tr=";
Expand All @@ -30,12 +26,13 @@ static:

/**
 * Reads one line from standard input and converts it to the project's text
 * type.
 *
 * Returns: the line as a WGString; the line terminator delivered by
 *          `readln` is not stripped.
 */
WGString ReadInputFromConsole()
{
    // Convert through the WGString alias rather than a hard-coded dstring so
    // this stays in step with the alias, like the file-reading path in main.
    return to!WGString(readln());
}

void main(string[] args)
{
SetConsoleOutputCP(65001);
SetConsoleCP(65001);

ITextTokenizer tokenizer = new TextTokenizer();
size_t tokenSize = 5;
Expand Down Expand Up @@ -79,7 +76,7 @@ void main(string[] args)
{
funRecreationsCount = to!(size_t)(arg[Arguments.TokenSize.length..$]);
}
if(arg.startsWith(Arguments.TokenRandomSizes))
/*if(arg.startsWith(Arguments.TokenRandomSizes))
{
string minValue, maxValue;
minValue = maxValue = "";
Expand Down Expand Up @@ -107,16 +104,17 @@ void main(string[] args)
}
}
tokenizer = new RandomTextTokenizer(to!size_t(minValue), to!size_t(maxValue));
}
} */
}

if(args.length <= 1)

if(!isShallReadFile)
{
input = ReadInputFromConsole();
}
else
{
input = cast(WGString)read(filename);
input = to!WGString(readText(filename));
}

Token[] tokens = tokenizer.Tokenize(input, tokenSize);
Expand Down

0 comments on commit 1f35f79

Please sign in to comment.