Skip to content

Commit 377e4fb

Browse files
committed
Fix out-of-bounds access.
Return nullptr when input text does not contain 'EOS' line.
1 parent 22b936b commit 377e4fb

File tree

8 files changed

+150
-32
lines changed

8 files changed

+150
-32
lines changed

cpp_cli/jdepp-app.cc

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,75 @@
1+
#include <cstring>
2+
#include <iostream>
3+
#include <string>
4+
#include <vector>
5+
16
#define OPTPARSE_IMPLEMENTATION
27
#include "optparse.h"
38

49
#include "pdep.h"
510

6-
int main (int argc, char** argv) {
11+
static void parse_library_options(int &i, const int argc, char **argv,
12+
std::vector<char *> &lib_args) {
13+
if (i < argc) {
14+
printf("i = %d, argc = %d\n", i, argc);
15+
if (std::strcmp(argv[i], "--") == 0) { // library options
16+
i++;
17+
while (i + lib_args.size() < argc &&
18+
std::strcmp(argv[i + lib_args.size()], "--") != 0) {
19+
lib_args.push_back(argv[i + lib_args.size()]);
20+
}
21+
22+
printf("libsize %d\n", lib_args.size());
23+
i += lib_args.size();
24+
lib_args.push_back(nullptr);
25+
} else {
26+
fprintf(stderr, "Type `%s --help' for option details.\n", argv[0]);
27+
exit(-1);
28+
}
29+
}
30+
}
31+
32+
int main(int argc, char **argv) {
33+
std::vector<char *> main_argvs;
34+
std::vector<char *> learner_argvs;
35+
std::vector<char *> chunk_argvs;
36+
std::vector<char *> depend_argvs;
37+
38+
// main args -- [learner args] -- [chunk_args] -- [depend_args]
39+
//
40+
// find library options
41+
int i = 0;
42+
while (i < argc) {
43+
if (std::strcmp(argv[i], "--") == 0) {
44+
break;
45+
}
46+
47+
main_argvs.push_back(argv[i]);
48+
i++;
49+
}
50+
// optparse use in pdep::option only use argv, so must add null terminator otherwise out-of-bounds access happens
51+
main_argvs.push_back(nullptr);
52+
53+
if (i < argc) {
54+
parse_library_options(i, argc, argv, learner_argvs);
55+
parse_library_options(i, argc, argv, chunk_argvs);
56+
parse_library_options(i, argc, argv, depend_argvs);
57+
}
58+
59+
//for (int i = 0; i < main_argvs.size(); i++) {
60+
// std::cout << "main[" << i << "] = " << main_argvs[i] << "\n";
61+
//}
62+
63+
//for (int i = 0; i < learner_argvs.size(); i++) {
64+
// std::cout << "learner[" << i << "] = " << learner_argvs[i] << "\n";
65+
//}
766

8-
pdep::option opt (argc, argv);
9-
pdep::parser parser (opt);
67+
pdep::option opt(int(main_argvs.size()), main_argvs.data(),
68+
int(learner_argvs.size()), learner_argvs.data(),
69+
int(chunk_argvs.size()), chunk_argvs.data(),
70+
int(depend_argvs.size()), depend_argvs.data());
71+
pdep::parser parser(opt);
1072

11-
parser.run ();
73+
parser.run();
1274
return 0;
1375
}

jdepp/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .jdepp_ext import *
1+
from jdepp_ext import *
22

33
# load setptools_scm generated _version.py
44
try:
@@ -14,6 +14,8 @@
1414

1515
from .jdepp_tools import *
1616

17+
__all__ = ['to_tree', 'to_dot']
18+
1719

1820
class Jdepp:
1921
def __init__(self):
@@ -24,6 +26,10 @@ def load_model(self, dict_path: Path):
2426
return self._jdepp.load_model(str(dict_path))
2527

2628
def parse_from_postagged(self, input_postagged: str):
29+
if isinstance(input_postagged, list):
30+
# Assume each line ends with newline '\n'
31+
input_postagged = "".join(input_postagged)
32+
2733
return self._jdepp.parse_from_postagged(input_postagged)
2834

2935
# for internal debugging.

jdepp/jdepp_tools.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
6363
tag = set (["D", "A", "P", "I"])
6464
ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
6565

66+
if isinstance(lines, str):
67+
lines = lines.split('\n')
68+
6669
result = ""
6770
#for line in iter (sys.stdin.readline, ""): # no buffering
6871
for line in lines:
@@ -88,6 +91,7 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
8891
binfo[-1].morph += pat_s.split (line_, 1)[0]
8992
for b in binfo:
9093
b.len = sum (ww[width (x)] for x in b.morph)
94+
9195
if not quiet or wrong:
9296
text = treeify (binfo)
9397
result += hader

jdepp/optparse.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,19 +152,19 @@ optparse_init(struct optparse *options, char **argv)
152152
static int
153153
optparse_is_dashdash(const char *arg)
154154
{
155-
return arg != 0 && arg[0] == '-' && arg[1] == '-' && arg[2] == '\0';
155+
return arg != 0 && (strlen(arg) == 2) && arg[0] == '-' && arg[1] == '-' && arg[2] == '\0';
156156
}
157157

158158
static int
159159
optparse_is_shortopt(const char *arg)
160160
{
161-
return arg != 0 && arg[0] == '-' && arg[1] != '-' && arg[1] != '\0';
161+
return arg != 0 && (strlen(arg) >= 2) && arg[0] == '-' && arg[1] != '-' && arg[1] != '\0';
162162
}
163163

164164
static int
165165
optparse_is_longopt(const char *arg)
166166
{
167-
return arg != 0 && arg[0] == '-' && arg[1] == '-' && arg[2] != '\0';
167+
return arg != 0 && (strlen(arg) >= 3) && arg[0] == '-' && arg[1] == '-' && arg[2] != '\0';
168168
}
169169

170170
static void

jdepp/pa.h

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -156,15 +156,15 @@ test test file set '-' to skip testing\n\
156156

157157
#ifdef USE_MULTICLASS
158158
#ifdef _OPENMP
159-
static const char* opal_opts = "t:d:kpO:o:l:c:i:asn:b:M:h";
159+
//static const char* opal_opts = "t:d:kpO:o:l:c:i:asn:b:M:h";
160160
#else
161-
static const char* opal_opts = "t:d:kpO:o:l:c:i:asb:M:h";
161+
//static const char* opal_opts = "t:d:kpO:o:l:c:i:asb:M:h";
162162
#endif
163163
#else
164164
#ifdef _OPENMP
165-
static const char* opal_opts = "t:d:kpO:o:l:c:i:asn:b:M:Ph";
165+
//static const char* opal_opts = "t:d:kpO:o:l:c:i:asn:b:M:Ph";
166166
#else
167-
static const char* opal_opts = "t:d:kpO:o:l:c:i:asb:M:Ph";
167+
//static const char* opal_opts = "t:d:kpO:o:l:c:i:asb:M:Ph";
168168
#endif
169169
#endif
170170

@@ -466,7 +466,6 @@ namespace opal {
466466
if (! splitN) // enable shrinkage
467467
shrink = true, splitN = (std::numeric_limits <uint>::max) ();
468468
if (std::strcmp (com, "--") == 0) return;
469-
printf("argc = %d, options.optind + 3 %d\n", argc, options.optind);
470469
if (argc < options.optind + 3) {
471470
printCredit ();
472471
my_errx (1, "Type `%s --help' for option details.", com);
@@ -1016,7 +1015,7 @@ namespace opal {
10161015
_fp = lfn ? std::fopen (lfn, "r") : stdin; // initialize
10171016
if (! _fp) my_errx (1, "no such file: %s", lfn);
10181017
std::setvbuf (_fp, &_buf[0], _IOFBF, BUF_SIZE);
1019-
_lm = lm; _fm = fm; _flag = flag; _M = M;
1018+
_lm = lm; _fm = fm; _flag = flag; _M = M;
10201019
if (_flag) // read data for feature packing / thresholding
10211020
read (_fp, _lm, _fm, _flag, _M), _flag = false;
10221021
}
@@ -1459,14 +1458,14 @@ namespace opal {
14591458
std::fprintf (stderr, "loading..");
14601459
#ifdef USE_STRING_FEATURE
14611460
char* ffn
1462-
= std::strcat (std::strcpy (new char[std::strlen (mfn) + 10], mfn),
1461+
= std::strcat (std::strcpy (new char[std::strlen (mfn) + 10], mfn),
14631462
".features");
14641463
_fm.load (ffn);
14651464
delete [] ffn;
14661465
#endif
14671466
FILE* reader = std::fopen (mfn, "r");
14681467
// examine model type
1469-
if (! reader || std::feof (reader))
1468+
if (! reader || std::feof (reader))
14701469
my_errx (1, "cannot read a model: %s", mfn);
14711470
char buf[BUF_SIZE]; std::setvbuf (reader, &buf[0], _IOFBF, BUF_SIZE);
14721471
const char flag = static_cast <char> (std::fgetc (reader));

jdepp/pdep.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -518,11 +518,16 @@ namespace pdep {
518518
char *r = _s->postagged () + len - 4;
519519
if (len < 4 || *r != 'E' || *(r+1) != 'O' || *(r+2) != 'S' || *(r+3) != '\n') {
520520
fprintf(stderr, "%s", "found a tagged sentence that is not EOS-terminated.");
521+
return nullptr;
521522
}
523+
522524
for (char* p (_s->postagged ()), *q (p); p != r; p = ++q) {
523-
while (q != r && *q != '\n') ++q;
524-
if (! _opt.ignore || std::strncmp (p, _opt.ignore, _opt.ignore_len) != 0)
525-
_s->add_token (p, q - p, _dict);
525+
while ((*q) && q != r && *q != '\n') ++q;
526+
if ((*q)) {
527+
if (! _opt.ignore || std::strncmp (p, _opt.ignore, _opt.ignore_len) != 0) {
528+
_s->add_token (p, q - p, _dict);
529+
}
530+
}
526531
}
527532
_chunk <PARSE> ();
528533
_s->setup (_dict);

jdepp/pdep.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,6 @@ static struct optparse_long maxent_long_options[] = {
226226
};
227227
#endif
228228

229-
//extern char* optarg;
230-
//extern int optind;
231229
#endif
232230

233231
// raw strings for utf8
@@ -337,8 +335,8 @@ namespace pdep {
337335

338336
#if 1
339337
//
340-
option (int argc, char** argv) :
341-
com (argv[0]), train ("train.JDP"),
338+
option (int main_argc, char** main_argv, int _learner_argc, char **_learner_argv, int _chunk_argc, char **_chunk_argv, int _depend_argc, char **_depend_argv) :
339+
com (main_argv[0]), train ("train.JDP"),
342340
#ifdef USE_STACKING
343341
model_dir (JDEPP_DEFAULT_MODEL "_stack"),
344342
#else
@@ -355,12 +353,11 @@ namespace pdep {
355353
#ifdef USE_AS_STANDALONE
356354
mecab_dic (MECAB_DICT),
357355
#endif
358-
learner_argv (0), chunk_argv (0), depnd_argv (0), learner_argc (0), chunk_argc (0), depnd_argc (0), verbose (0), ignore (0), ignore_len (0), utf8 (true) {
356+
learner_argv (_learner_argv), chunk_argv (_chunk_argv), depnd_argv (_depend_argv), learner_argc (_learner_argc), chunk_argc (_chunk_argc), depnd_argc (_depend_argc), verbose (0), ignore (0), ignore_len (0), utf8 (true) {
359357
// getOpt
360-
if (argc == 0) return;
361-
//int optind = 1;
358+
if (main_argc == 0) return;
362359
struct optparse options;
363-
optparse_init(&options, argv);
360+
optparse_init(&options, main_argv);
364361
while (1) {
365362
int opt = optparse_long (&options, jdepp_long_options, NULL);
366363
if (opt == -1) break;
@@ -436,6 +433,8 @@ namespace pdep {
436433
if (input == CHUNK && parser != LINEAR) {
437434
my_warnx ("%s", "NOTE: parsing algorithm [-p] is ignored in training a chunker.\n");
438435
}
436+
437+
#if 0
439438
// learner options
440439
if (std::strcmp (argv[options.optind - 1], "--") == 0) --options.optind;
441440

@@ -445,14 +444,15 @@ namespace pdep {
445444
_set_library_options (options, argc, argv, chunk_argc, chunk_argv);
446445
// classifier options for dependency parser
447446
_set_library_options (options, argc, argv, depnd_argc, depnd_argv);
447+
#endif
448448

449449
valid = true;
450450
}
451451
void printCredit () { std::fprintf (stderr, JDEPP_COPYRIGHT, com); }
452452
void printHelp () { std::fprintf (stderr, JDEPP_OPT0 JDEPP_OPT1 JDEPP_OPT_TRAIN JDEPP_OPT_TEST JDEPP_OPT_MISC); }
453453
#endif
454454
private:
455-
#if 1
455+
#if 0
456456
void _set_library_options (struct optparse &options, const int argc, char** argv,
457457
int& largc, char**& largv) {
458458

jdepp/python-binding-jdepp.cc

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,7 @@ class PySentence
590590

591591
class PyJdepp {
592592
public:
593+
const uint32_t kMaxChars = IOBUF_SIZE / 4; // Usually UTF-8 Japanese character is 3-bytes. div by 4 is convervative estimate.
593594
PyJdepp() {}
594595
PyJdepp(const std::string &model_path)
595596
: _model_path(model_path) {
@@ -602,16 +603,22 @@ class PyJdepp {
602603
// Curently we must set all parameters in options as (argc, argv), then construct parser.
603604
//
604605

605-
std::cout << "model_path " << model_path << "\n";
606+
//std::cout << "model_path " << model_path << "\n";
606607
_argv_str.push_back("pyjdepp");
607-
_argv_str.push_back("--verbose");
608-
_argv_str.push_back("10");
608+
//_argv_str.push_back("--verbose");
609+
//_argv_str.push_back("10");
609610
_argv_str.push_back("-m");
610611
_argv_str.push_back(model_path);
611612

613+
// TODO: args for learner, chunker, depend
614+
612615
setup_argv();
613616

614-
pdep::option options(int(_argv.size()), _argv.data());
617+
pdep::option options(int(_argv.size()), _argv.data(),
618+
int(_learner_argv.size()), _learner_argv.data(),
619+
int(_chunk_argv.size()), _chunk_argv.data(),
620+
int(_depend_argv.size()), _depend_argv.data()
621+
);
615622

616623
if (_parser && _parser->model_loaded()) {
617624
// discard previous model&instance.
@@ -634,7 +641,16 @@ class PyJdepp {
634641
return PySentence();
635642
}
636643

644+
if (input_postagged.size() > IOBUF_SIZE) {
645+
py::print("Input too large. Input bytes must be less than ", IOBUF_SIZE);
646+
return PySentence();
647+
}
648+
637649
const pdep::sentence_t *sent = _parser->parse_from_postagged(input_postagged.c_str(), input_postagged.size());
650+
if (!sent) {
651+
py::print("Failed to parse text from POS tagged string");
652+
return PySentence();
653+
}
638654

639655
PySentence pysent;
640656

@@ -722,6 +738,22 @@ class PyJdepp {
722738
for (auto &v : _argv_str) {
723739
_argv.push_back(const_cast<char *>(v.c_str()));
724740
}
741+
_argv.push_back(nullptr); // must add 'nullptr' at the end, otherwise out-of-bounds access will happen in optparse
742+
743+
for (auto &v : _learner_argv_str) {
744+
_learner_argv.push_back(const_cast<char *>(v.c_str()));
745+
}
746+
_learner_argv.push_back(nullptr);
747+
748+
for (auto &v : _chunk_argv_str) {
749+
_chunk_argv.push_back(const_cast<char *>(v.c_str()));
750+
}
751+
_chunk_argv.push_back(nullptr);
752+
753+
for (auto &v : _depend_argv_str) {
754+
_depend_argv.push_back(const_cast<char *>(v.c_str()));
755+
}
756+
_depend_argv.push_back(nullptr);
725757
}
726758

727759
uint32_t _nthreads{0}; // 0 = use all cores
@@ -731,6 +763,15 @@ class PyJdepp {
731763
std::vector<char *> _argv;
732764
std::vector<std::string> _argv_str;
733765

766+
std::vector<char *> _learner_argv;
767+
std::vector<std::string> _learner_argv_str;
768+
769+
std::vector<char *> _chunk_argv;
770+
std::vector<std::string> _chunk_argv_str;
771+
772+
std::vector<char *> _depend_argv;
773+
std::vector<std::string> _depend_argv_str;
774+
734775
};
735776

736777
} // namespace pyjdepp
@@ -743,6 +784,7 @@ PYBIND11_MODULE(jdepp_ext, m) {
743784
py::class_<pyjdepp::PyJdepp>(m, "JdeppExt")
744785
.def(py::init<>())
745786
//.def(py::init<std::string>())
787+
.def_readonly("MAX_CHARS", &pyjdepp::PyJdepp::kMaxChars)
746788
.def("load_model", &pyjdepp::PyJdepp::load_model)
747789
.def("parse_from_postagged", [](const pyjdepp::PyJdepp &self, const std::string &str) {
748790
//if (!self.model_loaded()) {

0 commit comments

Comments
 (0)