3 changes: 2 additions & 1 deletion .gitignore
@@ -58,4 +58,5 @@ node_modules/
# testing
/output
/vector_data
test.py
test.py
test.ipynb
57 changes: 53 additions & 4 deletions src/team_comm_tools/features/keywords.py
@@ -1,3 +1,4 @@
# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py
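# Note: entries are padded with leading/trailing spaces, presumably so that
# plain substring matching only fires on whole words (e.g., " hell " cannot
# match inside "hello").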
kw = {
"spacy_neg_only": {
"Negative_Emotion": [
@@ -7260,7 +7261,34 @@
" sorry ",
" woops ",
" whoops ",
" oops "
" oops ",
" apology "
],
"Third_Person": [
" he ",
" him ",
" his ",
" himself ",
" she ",
" her ",
" hers ",
" herself ",
" they ",
" them ",
" their ",
" theirs ",
" themselves "
],
"Contrast_Conjunction": [
" but ",
" however ",
" instead ",
" although ",
" even though ",
" despite ",
" and yet ",
" nevertheless ",
" nonetheless "
],
"Ask_Agency": [
" do me a favor ",
@@ -7365,7 +7393,6 @@
"Gratitude": [
" thank ",
" thanks ",
" thank you ",
" grateful ",
" gratitude ",
" cheers "
@@ -14419,25 +14446,47 @@
" cock ",
" crap ",
" damn ",
" dammit ",
" damnit ",
" dick ",
" dickhead ",
" dick-head ",
" dumb ",
" dumbass ",
" dumb-ass ",
" dumb ass ",
" dyke ",
" fuck ",
" fucking ",
" fucker ",
" goddam ",
" goddammit ",
" goddamed ",
" hell ",
" horshit ",
" homo ",
" jackass ",
" jackass ",
" motherfucker ",
" mother-fucker ",
" motherfucking ",
" nigger ",
" nigra ",
" piss ",
" prick ",
" pussy ",
" queer ",
" screw ",
" shit ",
" shite ",
" shitting ",
" sob ",
" sonofa ",
" suck ",
" sucked ",
" sucks "
" sucks ",
" twat ",
" wanker ",
" whore "
],
"Truth_Intensifier": [
" really ",
265 changes: 243 additions & 22 deletions src/team_comm_tools/features/politeness_v2_helper.py
@@ -215,32 +215,253 @@ def bare_command(doc):
return len(bc)


def is_in_subordinate_clause(tok, sent):
"""
Check if a token is inside a subordinate clause rather than the main clause.
"""
# Walk up from the token's head (not the token itself)
current = tok
while current.head != current and current != sent.root:
# Check if the HEAD has a subordinate clause dependency
if current.head.dep_ in {"advcl", "relcl", "acl", "ccomp", "xcomp"} and current.head != sent.root:
# We're attached to something that's a subordinate clause
return True
current = current.head
return False
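
# Usage sketch (assumes the module-level spaCy `nlp` pipeline used elsewhere in
# this file; exact parses may vary by spaCy model/version):
#
#   doc = nlp("I know what you did.")
#   sent = next(doc.sents)
#   tok = [t for t in sent if t.text == "what"][0]
#   is_in_subordinate_clause(tok, sent)  # expected: True ("what" sits under a ccomp)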

def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
"""
Returns True if the WH-word token is part of a real main-clause question.
"""
    # A sentence that starts with an auxiliary is a Yes/No question, not a WH-question
    sent_tokens = list(sent)
    if sent_tokens and sent_tokens[0].text.lower() in auxiliaries:
        return False

# For WH-determiners (both with and without ?), use special logic
if tok.dep_ == "det":
noun = tok.head

# Check: is the noun inside a complement clause?
current = noun
while current.head != current and current != sent.root:
if current.dep_ in {"ccomp", "xcomp"}:
return False
current = current.head

        # If the noun is a subject, this is likely a question with the WH-noun
        # as its subject (e.g., "Which option is better?")
        if noun.dep_ in {"nsubj", "nsubjpass"}:
            # Confirm by looking for an auxiliary after the noun
for t in sent:
if t.i > noun.i and t.text.lower() in auxiliaries:
return True

# If sentence ends with ?, accept it
if ends_with_question_mark:
return True

# For non-subject WH-determiners, check close ancestors for relcl
if tok.dep_ == "det" and tok.head.dep_ != "relcl":
# Check head and head's head
if tok.head.head.dep_ == "relcl" and tok.head.head.i < tok.i:
# relcl is before WH-word, likely a real relative clause
return False

# Check if there's an auxiliary after the WH-word/noun
for t in sent:
if t.i > noun.i and t.text.lower() in auxiliaries:
return True

return False

# For other WH-words (not determiners)

# Special handling for WH-words that are subjects (nsubj, nsubjpass)
if tok.dep_ in {"nsubj", "nsubjpass"}:
# First check: is this part of a relative clause?
has_relcl_ancestor = False
for anc in tok.ancestors:
if anc.dep_ == "relcl":
has_relcl_ancestor = True
break

if has_relcl_ancestor:
# Check if there's a noun before the WH-word (ignoring punctuation)
# This is the typical relative clause pattern: "the book, which..."
has_noun_before = False
for t in sent:
if t.i >= tok.i:
break
if t.pos_ in {"NOUN", "PROPN"}:
has_noun_before = True

# If there's a noun before WH and it has relcl ancestor, it's a real relative clause
if has_noun_before:
return False

# Check if there's actually a complement-taking verb before the WH-word
complement_taking_verbs = {
'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
'show', 'see', 'remember', 'forget', 'realize', 'figure',
'decide', 'consider', 'discover', 'find', 'learn', 'teach'
}

has_complement_verb_before = False
for t in sent:
if t.i >= tok.i:
break
if t.pos_ == "VERB" and t.lemma_ in complement_taking_verbs:
has_complement_verb_before = True
break

# If no complement-taking verb before WH-word, it's a main question
if not has_complement_verb_before:
# Check for an auxiliary after the WH-word
for t in sent:
if t.i > tok.i and t.text.lower() in auxiliaries:
return True
return False
# If there IS a complement verb, fall through to normal checks

# First check for complement clauses (ccomp, xcomp) - these are embedded questions
for anc in tok.ancestors:
if anc.dep_ in {"ccomp", "xcomp"}:
return False

# Check if WH-word is attached to a verb that takes interrogative complements
complement_taking_verbs = {
'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
'show', 'see', 'remember', 'forget', 'realize', 'figure',
'decide', 'consider', 'discover', 'find', 'learn', 'teach'
}

if tok.head.pos_ == "VERB" and tok.head.lemma_ in complement_taking_verbs:
# Check if there are tokens before this verb (indicating it's not sentence-initial)
tokens_before_verb = 0
for t in sent:
if t.i >= tok.head.i:
break
if t.pos_ not in {"PUNCT", "INTJ"}:
tokens_before_verb += 1

# If there are 2+ tokens before the verb, WH is likely embedded
if tokens_before_verb >= 2:
return False

# Check if has relcl ancestor
has_relcl_ancestor = False
for anc in tok.ancestors:
if anc.dep_ == "relcl":
has_relcl_ancestor = True
break

if has_relcl_ancestor:
# Check if this is a misparsed main question vs real relative clause
# Count substantive tokens before the WH-word
substantive_before = 0
for t in sent:
if t.i >= tok.i:
break
if t.pos_ not in {"INTJ", "PUNCT", "CCONJ", "DET"}:
substantive_before += 1

        # With 3+ substantive tokens before the WH-word, this is likely a real
        # relative clause; otherwise treat it as a misparsed main question
        if substantive_before >= 3:
            return False

# If the sentence ends with ?, be lenient for non-relcl WH-words
if ends_with_question_mark:
return True

# For non-? sentences with non-determiner WH-words that are NOT nsubj
# (nsubj was already handled above)
if is_in_subordinate_clause(tok, sent):
return False

if tok.dep_ not in {"nsubj", "nsubjpass", "csubj", "attr", "ROOT", "dobj", "pobj", "advmod"}:
return False

for t in sent:
if not is_in_subordinate_clause(t, sent) and t.text.lower() in auxiliaries:
return True

return False
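
# Behavior sketch (same assumptions as above; parses may vary by model):
#
#   aux = {'do', 'does', 'did', 'is', 'are'}
#   doc = nlp("What do you want?")
#   sent = next(doc.sents)
#   wh_is_real_question(sent[0], sent, aux, ends_with_question_mark=True)
#   # expected: True (a main-clause WH-question)
#
#   doc = nlp("I wonder what you want.")
#   sent = next(doc.sents)
#   tok = [t for t in sent if t.text == "what"][0]
#   wh_is_real_question(tok, sent, aux, ends_with_question_mark=False)
#   # expected: False ("what" sits inside a ccomp under "wonder")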

def Question(doc):
"""
    Counts Yes/No questions and WH-questions in a document. Sentences ending in '?'
    are classified directly; sentences without '?' are detected via lexical and
    syntactic cues.

Args:
doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.

Returns:
tuple: A tuple containing the counts of Yes/No questions and WH-questions.
"""

keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which '])
tags = set(['WRB', 'WP', 'WDT'])

# doc = nlp(text)
sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
all_qs = len(sentences)

n = 0
for i in range(len(sentences)):
whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]

if len(whq) > 0:
n += 1

return all_qs - n, n
search_tags = {'WRB', 'WP', 'WDT', 'WP$'}
wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which', 'whose', 'whom'}

auxiliaries = {
'do', 'does', 'did', 'have', 'has', 'had',
'can', 'could', 'will', 'would',
'may', 'might', 'shall', 'should',
'is', 'are', 'was', 'were', 'am'
}
pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it', 'these', 'those', 'this', 'that'}

wh_count = 0
yesno_count = 0
counted_sentences = set()

for sent in doc.sents:
sent_text = sent.text.strip()
sent_tokens = list(sent)
if not sent_tokens:
continue

# Method 1: Sentences ending with '?'
if sent_text.endswith('?'):
wh = False
for tok1 in sent_tokens:
t1_lower = tok1.text.lower()
if t1_lower in wh_words and tok1.tag_ in search_tags:
if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=True):
wh = True
break
if wh:
wh_count += 1
else:
yesno_count += 1
counted_sentences.add(sent.start)
continue

# Method 2: Lexical rule-based detection for sentences without '?'
found_question = False
for tok1 in sent_tokens:
t1_lower = tok1.text.lower()
if t1_lower in wh_words and tok1.tag_ in search_tags:
if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=False):
wh_count += 1
counted_sentences.add(sent.start)
found_question = True
break

if found_question:
continue

# Check for Yes/No questions
for tok1, tok2 in zip(sent_tokens, sent_tokens[1:] + [None]):
t1_lower = tok1.text.lower()
t2_lower = tok2.text.lower() if tok2 else None

            # Only consider auxiliaries within the first two tokens of the sentence
            if tok1.i - sent.start > 1:
                continue

if t1_lower in auxiliaries and t2_lower in pronoun_followers:
yesno_count += 1
counted_sentences.add(sent.start)
break

return yesno_count, wh_count
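
# Example usage (sketch; assumes the module-level `nlp` pipeline):
#
#   doc = nlp("Do you have a minute? What should we do next?")
#   yesno, wh = Question(doc)  # expected: (1, 1)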


def word_start(keywords, doc):