Skip to content

Commit

Permalink
fix split function #82 (#92)
Browse files (browse the repository at this point in the history)
Co-authored-by: Fadl <chaos@efqr.dev>
  • Loading branch information
himynamesdave and fqrious authored Nov 22, 2024
1 parent 6140c58 commit 6473e78
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 17 deletions.
29 changes: 13 additions & 16 deletions txt2stix/pattern/extractors/base_extractor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import re
import logging
from typing import Iterable

logger = logging.getLogger(__name__)

Expand All @@ -11,14 +12,14 @@
class BaseExtractor:
name = None
extraction_regex = None
stripe_on_line = False
extraction_function = None
common_strip_elements = "\"'.,:"
filter_function = None # further filter the extracted values
meta_extractor = None
version = None
stix_mapping = None
invalid_characters = ['.', ',', '!', '`', '(', ')', '{', '}', '"', '````', ' ']
invalid_characters = ['.', ',', '!', '`', '(', ')', '{', '}', '"', '````', ' ', '[', ']']
SPLITS_FINDER = re.compile(r'[\'"<\(\{\[\s].*?[\)\s\]\}\)>"\']') #split on boundary characters instead of ' ' only


@classmethod
Expand All @@ -39,7 +40,7 @@ def extract_extraction_from_text(cls, text: str):
start_index = 0
if cls.extraction_regex is not None:
if cls.extraction_regex.startswith("^") or cls.extraction_regex.endswith("$"):
for word in text.split():
for word in cls.split_all(text):
end_index = start_index + len(word) - 1
match = re.match(cls.extraction_regex, word)
if match:
Expand All @@ -63,10 +64,8 @@ def extract_extraction_from_text(cls, text: str):
elif cls.extraction_function is not None:

start_index = 0
if cls.stripe_on_line:
words = text.splitlines()
else:
words = text.split()

words = cls.SPLITS_FINDER.findall(text)
for word in words:
end_index = start_index + len(word) - 1

Expand Down Expand Up @@ -119,13 +118,11 @@ def search_keyword_positions(input_string, keyword):
positions_only = [pos for kw, pos in keyword_positions]
return keyword, positions_only

@staticmethod
def trim_invalid_characters(keyword, characters):
if keyword[-1] in characters:
keyword = keyword[:-1]

if len(keyword) > 0:
if keyword[0] in characters:
keyword = keyword[1:]
@classmethod
def split_all(cls, text):
for word in cls.SPLITS_FINDER.findall(text):
yield cls.trim_invalid_characters(word, cls.invalid_characters)

return keyword
@classmethod
def trim_invalid_characters(cls, keyword: str, characters: Iterable):
return keyword.strip(''.join(characters))
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,7 +12,7 @@ class PhoneNumberExtractor(BaseExtractor):
"""

name = "pattern_phone_number"
extraction_regex = r'(\+\d{1,3}\s?\d{1,4}\s?\d{1,4}\s?\d{1,4})'
extraction_regex = r'((\+|00)\d{1,3}\s?\d{1,4}\s?\d{1,4}\s?\d{1,4})'

@staticmethod
def validate_phone_number(regex, phone_number):
Expand Down

0 comments on commit 6473e78

Please sign in to comment.