3 changes: 2 additions & 1 deletion .gitignore
@@ -58,4 +58,5 @@ node_modules/
# testing
/output
/vector_data
test.py
test.py
test.ipynb
57 changes: 53 additions & 4 deletions src/team_comm_tools/features/keywords.py
@@ -1,3 +1,4 @@
# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py
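# Note: entries are padded with leading/trailing spaces, presumably so that
# plain substring matching only fires on whole words (e.g., " hell " cannot
# match inside "hello").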
kw = {
"spacy_neg_only": {
"Negative_Emotion": [
@@ -7260,7 +7261,34 @@
" sorry ",
" woops ",
" whoops ",
" oops "
" oops ",
" apology "
],
"Third_Person": [
" he ",
" him ",
" his ",
" himself ",
" she ",
" her ",
" hers ",
" herself ",
" they ",
" them ",
" their ",
" theirs ",
" themselves "
],
"Contrast_Conjunction": [
" but ",
" however ",
" instead ",
" although ",
" even though ",
" despite ",
" and yet ",
" nevertheless ",
" nonetheless "
],
"Ask_Agency": [
" do me a favor ",
@@ -7365,7 +7393,6 @@
"Gratitude": [
" thank ",
" thanks ",
" thank you ",
" grateful ",
" gratitude ",
" cheers "
@@ -14419,25 +14446,47 @@
" cock ",
" crap ",
" damn ",
" dammit ",
" damnit ",
" dick ",
" dickhead ",
" dick-head ",
" dumb ",
" dumbass ",
" dumb-ass ",
" dumb ass ",
" dyke ",
" fuck ",
" fucking ",
" fucker ",
" goddam ",
" goddammit ",
" goddamed ",
" hell ",
" horshit ",
" homo ",
" jackass ",
" jackass ",
" motherfucker ",
" mother-fucker ",
" motherfucking ",
" nigger ",
" nigra ",
" piss ",
" prick ",
" pussy ",
" queer ",
" screw ",
" shit ",
" shite ",
" shitting ",
" sob ",
" sonofa ",
" suck ",
" sucked ",
" sucks "
" sucks ",
" twat ",
" wanker ",
" whore "
],
"Truth_Intensifier": [
" really ",
265 changes: 243 additions & 22 deletions src/team_comm_tools/features/politeness_v2_helper.py
@@ -215,32 +215,253 @@ def bare_command(doc):
return len(bc)


def is_in_subordinate_clause(tok, sent):
"""
Check if a token is inside a subordinate clause rather than the main clause.
"""
# Walk up from the token's head (not the token itself)
current = tok
while current.head != current and current != sent.root:
# Check if the HEAD has a subordinate clause dependency
if current.head.dep_ in {"advcl", "relcl", "acl", "ccomp", "xcomp"} and current.head != sent.root:
# We're attached to something that's a subordinate clause
return True
current = current.head
return False
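
# Usage sketch (assumes the module-level spaCy `nlp` pipeline used elsewhere in
# this file; exact parses may vary by spaCy model/version):
#
#   doc = nlp("I know what you did.")
#   sent = next(doc.sents)
#   tok = [t for t in sent if t.text == "what"][0]
#   is_in_subordinate_clause(tok, sent)  # expected: True ("what" sits under a ccomp)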

def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
"""
Returns True if the WH-word token is part of a real main-clause question.
"""
    # A sentence that starts with an auxiliary is a Yes/No question, not a WH-question
    sent_tokens = list(sent)
    if sent_tokens and sent_tokens[0].text.lower() in auxiliaries:
        return False

# For WH-determiners (both with and without ?), use special logic
if tok.dep_ == "det":
noun = tok.head

# Check: is the noun inside a complement clause?
current = noun
while current.head != current and current != sent.root:
if current.dep_ in {"ccomp", "xcomp"}:
return False
current = current.head

        # If the noun is a subject, this is likely a question with the WH-noun
        # as its subject (e.g., "Which option is better?")
        if noun.dep_ in {"nsubj", "nsubjpass"}:
            # Confirm by looking for an auxiliary after the noun
for t in sent:
if t.i > noun.i and t.text.lower() in auxiliaries:
return True

# If sentence ends with ?, accept it
if ends_with_question_mark:
return True

# For non-subject WH-determiners, check close ancestors for relcl
if tok.dep_ == "det" and tok.head.dep_ != "relcl":
# Check head and head's head
if tok.head.head.dep_ == "relcl" and tok.head.head.i < tok.i:
# relcl is before WH-word, likely a real relative clause
return False

# Check if there's an auxiliary after the WH-word/noun
for t in sent:
if t.i > noun.i and t.text.lower() in auxiliaries:
return True

return False

# For other WH-words (not determiners)

# Special handling for WH-words that are subjects (nsubj, nsubjpass)
if tok.dep_ in {"nsubj", "nsubjpass"}:
# First check: is this part of a relative clause?
has_relcl_ancestor = False
for anc in tok.ancestors:
if anc.dep_ == "relcl":
has_relcl_ancestor = True
break

if has_relcl_ancestor:
# Check if there's a noun before the WH-word (ignoring punctuation)
# This is the typical relative clause pattern: "the book, which..."
has_noun_before = False
for t in sent:
if t.i >= tok.i:
break
if t.pos_ in {"NOUN", "PROPN"}:
has_noun_before = True

# If there's a noun before WH and it has relcl ancestor, it's a real relative clause
if has_noun_before:
return False

# Check if there's actually a complement-taking verb before the WH-word
complement_taking_verbs = {
'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
'show', 'see', 'remember', 'forget', 'realize', 'figure',
'decide', 'consider', 'discover', 'find', 'learn', 'teach'
}

has_complement_verb_before = False
for t in sent:
if t.i >= tok.i:
break
if t.pos_ == "VERB" and t.lemma_ in complement_taking_verbs:
has_complement_verb_before = True
break

# If no complement-taking verb before WH-word, it's a main question
if not has_complement_verb_before:
# Check for an auxiliary after the WH-word
for t in sent:
if t.i > tok.i and t.text.lower() in auxiliaries:
return True
return False
# If there IS a complement verb, fall through to normal checks

# First check for complement clauses (ccomp, xcomp) - these are embedded questions
for anc in tok.ancestors:
if anc.dep_ in {"ccomp", "xcomp"}:
return False

# Check if WH-word is attached to a verb that takes interrogative complements
complement_taking_verbs = {
'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
'show', 'see', 'remember', 'forget', 'realize', 'figure',
'decide', 'consider', 'discover', 'find', 'learn', 'teach'
}

if tok.head.pos_ == "VERB" and tok.head.lemma_ in complement_taking_verbs:
# Check if there are tokens before this verb (indicating it's not sentence-initial)
tokens_before_verb = 0
for t in sent:
if t.i >= tok.head.i:
break
if t.pos_ not in {"PUNCT", "INTJ"}:
tokens_before_verb += 1

# If there are 2+ tokens before the verb, WH is likely embedded
if tokens_before_verb >= 2:
return False

# Check if has relcl ancestor
has_relcl_ancestor = False
for anc in tok.ancestors:
if anc.dep_ == "relcl":
has_relcl_ancestor = True
break

if has_relcl_ancestor:
# Check if this is a misparsed main question vs real relative clause
# Count substantive tokens before the WH-word
substantive_before = 0
for t in sent:
if t.i >= tok.i:
break
if t.pos_ not in {"INTJ", "PUNCT", "CCONJ", "DET"}:
substantive_before += 1

        # With 3+ substantive tokens before the WH-word, this is likely a real
        # relative clause; otherwise treat it as a misparsed main question
        if substantive_before >= 3:
            return False

# If the sentence ends with ?, be lenient for non-relcl WH-words
if ends_with_question_mark:
return True

# For non-? sentences with non-determiner WH-words that are NOT nsubj
# (nsubj was already handled above)
if is_in_subordinate_clause(tok, sent):
return False

if tok.dep_ not in {"nsubj", "nsubjpass", "csubj", "attr", "ROOT", "dobj", "pobj", "advmod"}:
return False

for t in sent:
if not is_in_subordinate_clause(t, sent) and t.text.lower() in auxiliaries:
return True

return False
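
# Behavior sketch (same assumptions as above; parses may vary by model):
#
#   aux = {'do', 'does', 'did', 'is', 'are'}
#   doc = nlp("What do you want?")
#   sent = next(doc.sents)
#   wh_is_real_question(sent[0], sent, aux, ends_with_question_mark=True)
#   # expected: True (a main-clause WH-question)
#
#   doc = nlp("I wonder what you want.")
#   sent = next(doc.sents)
#   tok = [t for t in sent if t.text == "what"][0]
#   wh_is_real_question(tok, sent, aux, ends_with_question_mark=False)
#   # expected: False ("what" sits inside a ccomp under "wonder")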

def Question(doc):
"""
    Counts Yes/No questions and WH-questions in a document. Sentences ending in '?'
    are classified directly; sentences without '?' are detected via lexical and
    syntactic cues.

Args:
doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.

Returns:
tuple: A tuple containing the counts of Yes/No questions and WH-questions.
"""

keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which '])
tags = set(['WRB', 'WP', 'WDT'])

# doc = nlp(text)
sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
all_qs = len(sentences)

n = 0
for i in range(len(sentences)):
whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]

if len(whq) > 0:
n += 1

return all_qs - n, n
search_tags = {'WRB', 'WP', 'WDT', 'WP$'}
wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which', 'whose', 'whom'}

auxiliaries = {
'do', 'does', 'did', 'have', 'has', 'had',
'can', 'could', 'will', 'would',
'may', 'might', 'shall', 'should',
'is', 'are', 'was', 'were', 'am'
}
pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it', 'these', 'those', 'this', 'that'}

wh_count = 0
yesno_count = 0
counted_sentences = set()

for sent in doc.sents:
sent_text = sent.text.strip()
sent_tokens = list(sent)
if not sent_tokens:
continue

# Method 1: Sentences ending with '?'
if sent_text.endswith('?'):
wh = False
for tok1 in sent_tokens:
t1_lower = tok1.text.lower()
if t1_lower in wh_words and tok1.tag_ in search_tags:
if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=True):
wh = True
break
if wh:
wh_count += 1
else:
yesno_count += 1
counted_sentences.add(sent.start)
continue

# Method 2: Lexical rule-based detection for sentences without '?'
found_question = False
for tok1 in sent_tokens:
t1_lower = tok1.text.lower()
if t1_lower in wh_words and tok1.tag_ in search_tags:
if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=False):
wh_count += 1
counted_sentences.add(sent.start)
found_question = True
break

if found_question:
continue

# Check for Yes/No questions
for tok1, tok2 in zip(sent_tokens, sent_tokens[1:] + [None]):
t1_lower = tok1.text.lower()
t2_lower = tok2.text.lower() if tok2 else None

            # Only consider auxiliaries within the first two tokens of the sentence
            if tok1.i - sent.start > 1:
                continue

if t1_lower in auxiliaries and t2_lower in pronoun_followers:
yesno_count += 1
counted_sentences.add(sent.start)
break

return yesno_count, wh_count
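
# Example usage (sketch; assumes the module-level `nlp` pipeline):
#
#   doc = nlp("Do you have a minute? What should we do next?")
#   yesno, wh = Question(doc)  # expected: (1, 1)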


def word_start(keywords, doc):