From c3b4422cb3b959eba25fd321e6b2382e0ba752ba Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Tue, 16 Dec 2025 11:30:38 -0800
Subject: [PATCH 01/10] sync politeness updates

---
 .gitignore                                    |  3 +-
 src/team_comm_tools/features/keywords.py      | 57 ++++++++++-
 .../features/politeness_v2_helper.py          | 92 +++++++++++++++----
 3 files changed, 129 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 524f3aed..aeadf1ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,4 +58,5 @@ node_modules/
 # testing
 /output
 /vector_data
-test.py
\ No newline at end of file
+test.py
+test.ipynb
\ No newline at end of file

diff --git a/src/team_comm_tools/features/keywords.py b/src/team_comm_tools/features/keywords.py
index a073a009..842a7943 100644
--- a/src/team_comm_tools/features/keywords.py
+++ b/src/team_comm_tools/features/keywords.py
@@ -1,3 +1,4 @@
+# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py
 kw = {
     "spacy_neg_only": {
         "Negative_Emotion": [
@@ -7260,7 +7261,34 @@
             " sorry ",
             " woops ",
             " whoops ",
-            " oops "
+            " oops ",
+            " apology "
+        ],
+        "Third_Person": [
+            " he ",
+            " him ",
+            " his ",
+            " himself ",
+            " she ",
+            " her ",
+            " hers ",
+            " herself ",
+            " they ",
+            " them ",
+            " their ",
+            " theirs ",
+            " themselves "
+        ],
+        "Contrast_Conjunction": [
+            " but ",
+            " however ",
+            " instead ",
+            " although ",
+            " even though ",
+            " despite ",
+            " and yet ",
+            " nevertheless ",
+            " nonetheless "
         ],
         "Ask_Agency": [
             " do me a favor ",
@@ -7365,7 +7393,7 @@
         "Gratitude": [
             " thank ",
             " thanks ",
-            " thank you ",
+            #" thank you ",
             " grateful ",
             " gratitude ",
             " cheers "
@@ -14419,25 +14447,46 @@
             " cock ",
             " crap ",
             " damn ",
+            " dammit ",
+            " damnit ",
             " dick ",
+            " dickhead ",
+            " dick-head ",
             " dumb ",
+            " dumbass ",
+            " dumb-ass ",
+            " dumb ass ",
             " dyke ",
             " fuck ",
+            " fucking ",
+            " fucker ",
             " goddam ",
+            " goddammit ",
+            " goddamned ",
             " hell ",
+            " horseshit ",
             " homo ",
+            " jackass ",
+            " motherfucker ",
+            " mother-fucker ",
+            " motherfucking ",
             " nigger ",
+            " nigra ",
             " piss ",
             " prick ",
             " pussy ",
             " queer ",
             " screw ",
             " shit ",
+            " shite ",
+            " shitting ",
             " sob ",
-            " sonofa ",
             " suck ",
             " sucked ",
-            " sucks "
+            " sucks ",
+            " twat ",
+            " wanker ",
+            " whore "
         ],
         "Truth_Intensifier": [
             " really ",
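Each entry in the keyword lists above carries a leading and trailing space, which implies whole-word matching: the message text gets padded with spaces the same way, and occurrences are counted by substring search, so that " but " can never fire inside "butter". A minimal sketch of that convention, with a hypothetical matcher (the real counting code in team_comm_tools is not part of this patch):

    # Hypothetical sketch of whole-word matching via space padding; the
    # function name and punctuation handling are assumptions, not the
    # package's actual matcher.
    def count_keyword_hits(text, keywords):
        # Collapse whitespace and pad, so every word is space-delimited.
        padded = " " + " ".join(text.lower().split()) + " "
        return sum(padded.count(kw) for kw in keywords)

    count_keyword_hits("thanks, I am so grateful", [" thanks ", " grateful "])
    # -> 1: " grateful " hits, but "thanks," keeps its comma, so any
    # punctuation stripping must happen upstream of a matcher like this.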
diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 0385acd6..de117395 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -218,29 +218,85 @@ def bare_command(doc):
 def Question(doc):
     """
     Counts the number of sentences containing question words and question marks.
-
+    Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py
     Args:
         doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.
-
     Returns:
         tuple: A tuple containing the counts of Yes/No questions and WH-questions.
     """
-
-    keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which '])
-    tags = set(['WRB', 'WP', 'WDT'])
-
-    # doc = nlp(text)
-    sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    all_qs = len(sentences)
-
-    n = 0
-    for i in range(len(sentences)):
-        whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-
-        if len(whq) > 0:
-            n += 1
-
-    return all_qs - n, n
+    # POS tags for WH-words like who/what/where
+    search_tags = {'WRB', 'WP', 'WDT'}
+    # WH-words and common auxiliaries that follow them in real questions
+    wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
+    wh_followers = {
+        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'who': {'is', 'are', 'was', 'can', 'should'},
+        'where': {'is', 'are', 'can', 'should'},
+        'when': {'is', 'are', 'can', 'should'},
+        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'is', 'are', 'was', 'can', 'should'}
+    }
+    # Auxiliaries that typically initiate Yes/No questions
+    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
+                 # 'can', 'could', 'will', 'would',
+                 'may', 'might', 'shall', 'should',
+                 'is', 'are', 'was', 'were', 'am'}
+    # Pronouns that often follow auxiliaries in Yes/No questions
+    pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
+    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm', 'like'}
+
+    wh_count = 0
+    yesno_count = 0
+    counted_sentences = set()
+    for sent in doc.sents:
+        sent_text = sent.text.strip()
+        sent_tokens = list(sent)
+        if not sent_tokens:
+            continue
+        # Method 1: Find question sentences by checking for '?' at end
+        if sent_text.endswith('?'):
+            # try to find the first WH-word in the sentence
+            wh = False
+            for token in sent_tokens:
+                if token.text.lower() in wh_words and token.tag_ in search_tags and token.dep_ not in {"relcl", "acl"}\
+                    and token.i < sent.root.i:
+                    wh = True
+                    break
+            if wh:
+                wh_count += 1
+            else:
+                # Fallback: no WH in the sentence → treat as Yes/No question
+                yesno_count += 1
+            counted_sentences.add(sent.start)
+            continue
+        # Method 2: For remaining sentences, apply lexical rule-based detection --- Extract tokens and their metadata for fast access
+        for i in range(len(sent_tokens) - 1):
+            tok1 = sent_tokens[i]
+            tok2 = sent_tokens[i + 1]
+            t1_lower = tok1.text.lower()
+            t2_lower = tok2.text.lower()
+            if sent.start in counted_sentences:
+                break  # already counted
+            # WH pattern
+            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
+            # Yes/No pattern
+            if t1_lower in yesno_aux and t2_lower in pronoun_followers:
+                yesno_count += 1
+                counted_sentences.add(sent.start)
+                break
+    return yesno_count, wh_count
+    # sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
+    # all_qs = len(sentences)
+    # n = 0
+    # for i in range(len(sentences)):
+    #     whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
+    #     if len(whq) > 0:
+    #         n += 1
+    # return all_qs - n, n
 
 
 def word_start(keywords, doc):
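The rewritten Question() above works in two passes: sentences ending in '?' are labeled WH only when a WH-word with a WRB/WP/WDT tag, outside relcl/acl, precedes the sentence root, and otherwise fall back to Yes/No (Method 1); sentences without '?' are scanned over adjacent token pairs for WH-word + follower or auxiliary + pronoun (Method 2). A rough usage sketch -- the en_core_web_sm model and the import path are assumptions; the function only requires a parsed spaCy Doc:

    import spacy
    from team_comm_tools.features.politeness_v2_helper import Question

    nlp = spacy.load("en_core_web_sm")  # assumed model choice

    # Method 1: sentences that end with '?'.
    print(Question(nlp("Where is the rest of our team?")))  # expected (0, 1)
    print(Question(nlp("Are you sure?")))                   # expected (1, 0)

    # Method 2: no '?'; pairs like ("what", "is") or ("did", "you").
    print(Question(nlp("what is the question")))            # expected (0, 1)
    print(Question(nlp("did you see that")))                # expected (1, 0)

The returned tuple is (yesno_count, wh_count), with Yes/No questions first, matching the docstring.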
From b388a328ff0e06d7c9f23a0e45f6f9215bdc917b Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Tue, 16 Dec 2025 15:13:46 -0800
Subject: [PATCH 02/10] add new WH/yesno questions to yeomans_test dataset

---
 tests/data/cleaned_data/test_chat_level.csv | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/data/cleaned_data/test_chat_level.csv b/tests/data/cleaned_data/test_chat_level.csv
index a463c715..2c2a1f31 100644
--- a/tests/data/cleaned_data/test_chat_level.csv
+++ b/tests/data/cleaned_data/test_chat_level.csv
@@ -1437,11 +1437,17 @@ yeomans_test,yeomans_user_b,I guess they almost complete it.,Hedges_receptivenes
 yeomans_test,yeomans_user_a,"Ma'am, no my lady, do you know Mr. Smith?",Formal_Title_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,"I agree, this is correct",Agreement_receptiveness_yeomans,2
 yeomans_test,yeomans_user_a,This is for you. Why you don't understand it's for you?,For_You_receptiveness_yeomans,2
-yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,3
+yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,2
 yeomans_test,yeomans_user_a,We here you. We totally understand,Acknowledgement_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,"Shit. You dumb asshole, what the hell? Who's that bastard? Suck my dick.",Swearing_receptiveness_yeomans,7
 yeomans_test,yeomans_user_a,"Hey hello good morning, oh actually good evening.",Hello_receptiveness_yeomans,4
 yeomans_test,yeomans_user_b,Are you sure? Is this the guy? Did he lie to you?,YesNo_Questions_receptiveness_yeomans,3
+yeomans_test,yeomans_user_a,"Did you finish the report, which was due today?",YesNo_Questions_receptiveness_yeomans,1
+yeomans_test,yeomans_user_b,"Did you finish the report, which was due today?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_a,"We can start here. What is the question?",WH_Questions_receptiveness_yeomans,1
+yeomans_test,yeomans_user_b,"Has she met the teacher who helped you last year?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_a,"Do you know what time it is?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_b,"Have you read the article that explains why this happens?",WH_Questions_receptiveness_yeomans,0
 yeomans_test,yeomans_user_a,I'm sorry I sincerely apologize.,Apology_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,Wow! Amazing! Perfect!,Affirmation_receptiveness_yeomans,3
 yeomans_test,yeomans_user_a,I love you. My friend,First_Person_Single_receptiveness_yeomans,2
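The two "report" rows pin down the intended behavior: in "Did you finish the report, which was due today?", the "which" opens a relative clause and must not count as a WH-question. Under Method 1 it is the token.i < sent.root.i guard that typically rejects it, since the WH-word sits after the root verb "finish"; the relcl/acl dependency check covers parses where the label lands on the WH-word itself. A quick probe of the parse (en_core_web_sm assumed; labels can vary across model versions):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    sent = list(nlp("Did you finish the report, which was due today?").sents)[0]
    wh = next(t for t in sent if t.text.lower() == "which")
    print(wh.tag_, wh.dep_, wh.i, "| root:", sent.root.text, sent.root.i)
    # The WH-word is indexed after the root, so Method 1 falls through to
    # the Yes/No branch: (YesNo, WH) = (1, 0), as the rows above expect.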
From 14dc29478e4173d067cda6eaca166333247db0 Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Thu, 8 Jan 2026 10:25:18 -0800
Subject: [PATCH 03/10] Refactor Yes/No question detection and improve WH
 question handling in the Question function

---
 .../features/politeness_v2_helper.py | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index de117395..69dbb3ef 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -238,14 +238,15 @@ def Question(doc):
         'which': {'is', 'are', 'was', 'can', 'should'}
     }
     # Auxiliaries that typically initiate Yes/No questions
-    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
-                 # 'can', 'could', 'will', 'would',
-                 'may', 'might', 'shall', 'should',
-                 'is', 'are', 'was', 'were', 'am'}
+    yesno_aux = {
+        'do', 'does', 'did', 'have', 'has', 'had',
+        'can', 'could', 'will', 'would',
+        'may', 'might', 'shall', 'should',
+        'is', 'are', 'was', 'were', 'am'
+    }
     # Pronouns that often follow auxiliaries in Yes/No questions
     pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
-    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm', 'like'}
-
+
     wh_count = 0
     yesno_count = 0
     counted_sentences = set()
@@ -278,25 +279,18 @@ def Question(doc):
             t2_lower = tok2.text.lower()
             if sent.start in counted_sentences:
                 break  # already counted
-            # WH pattern
-            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
-                wh_count += 1
-                counted_sentences.add(sent.start)
-                break
             # Yes/No pattern
             if t1_lower in yesno_aux and t2_lower in pronoun_followers:
                 yesno_count += 1
                 counted_sentences.add(sent.start)
                 break
+            # WH pattern
+            if t1_lower in wh_words and tok1.tag_ in search_tags and tok1.dep_ not in {"relcl", "acl"}\
+                and tok1.i < sent.root.i and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
     return yesno_count, wh_count
-    # sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    # all_qs = len(sentences)
-    # n = 0
-    # for i in range(len(sentences)):
-    #     whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-    #     if len(whq) > 0:
-    #         n += 1
-    # return all_qs - n, n
 
 
 def word_start(keywords, doc):
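This refactor tightens the non-'?' WH pattern with three syntactic guards: a WRB/WP/WDT tag, a dependency outside relcl/acl, and a position before the sentence root. One case still slips through: in "the book, which is on the table, is red", a typical parse attaches the relative-clause label to the verb rather than to "which" (which is a subject), and "which" still precedes the main root, so the pair ("which", "is") keeps matching. Later patches in this series close exactly that hole; the parse can be inspected as follows (en_core_web_sm assumed, labels may vary):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    sent = list(nlp("the book, which is on the table, is red").sents)[0]
    for tok in sent:
        if tok.text.lower() in {"which", "is"}:
            print(tok.text, tok.i, tok.tag_, tok.dep_, "<-", tok.head.text)
    # In a typical parse "which" is nsubj of the first "is", and that "is"
    # carries dep_ == "relcl" on "book" -- the motivation for the
    # tok2.dep_ != "relcl" guard that PATCH 07 adds on the follower token.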
From 125d82172948a8c5b2e5d4e58034b74e31c9ff2c Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 16:59:22 -0800
Subject: [PATCH 04/10] add 'am' to WH followers

---
 src/team_comm_tools/features/politeness_v2_helper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 69dbb3ef..8e83a54d 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -230,7 +230,7 @@ def Question(doc):
     wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
     wh_followers = {
         'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
-        'who': {'is', 'are', 'was', 'can', 'should'},
+        'who': {'am', 'is', 'are', 'was', 'can', 'should'},
         'where': {'is', 'are', 'can', 'should'},
         'when': {'is', 'are', 'can', 'should'},
         'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},

From a6e99a78a21349667626266f86da3a36dc9e8ecc Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 17:00:58 -0800
Subject: [PATCH 05/10] remove commented-out thank you

---
 src/team_comm_tools/features/keywords.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/team_comm_tools/features/keywords.py b/src/team_comm_tools/features/keywords.py
index 842a7943..74e6ff63 100644
--- a/src/team_comm_tools/features/keywords.py
+++ b/src/team_comm_tools/features/keywords.py
@@ -7393,7 +7393,6 @@
         "Gratitude": [
             " thank ",
             " thanks ",
-            #" thank you ",
             " grateful ",
             " gratitude ",
             " cheers "

From 183d1e633ac65bd62ee5c28969141527d204047f Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 17:09:22 -0800
Subject: [PATCH 06/10] add am everywhere

---
 src/team_comm_tools/features/politeness_v2_helper.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 8e83a54d..d1e5e765 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -229,13 +229,13 @@ def Question(doc):
     # WH-words and common auxiliaries that follow them in real questions
     wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
     wh_followers = {
-        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'what': {'am', 'are', 'is', 'do', 'does', 'can', 'should', 'might'},
         'who': {'am', 'is', 'are', 'was', 'can', 'should'},
-        'where': {'is', 'are', 'can', 'should'},
-        'when': {'is', 'are', 'can', 'should'},
-        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
-        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
-        'which': {'is', 'are', 'was', 'can', 'should'}
+        'where': {'am', 'is', 'are', 'can', 'should'},
+        'when': {'am', 'is', 'are', 'can', 'should'},
+        'why': {'am', 'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'am', 'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'am', 'is', 'are', 'was', 'can', 'should'}
     }

From bafe275c60c38a70134c5de89fccc98e28db693b Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Fri, 9 Jan 2026 15:19:35 -0800
Subject: [PATCH 07/10] update to question detection --- .../features/politeness_v2_helper.py | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index d1e5e765..a671d7af 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -229,13 +229,13 @@ def Question(doc): # WH-words and common auxiliaries that follow them in real questions wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'} wh_followers = { - 'what': {'am', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, - 'who': {'am', 'is', 'are', 'was', 'can', 'should'}, - 'where': {'am', 'is', 'are', 'can', 'should'}, - 'when': {'am', 'is', 'are', 'can', 'should'}, - 'why': {'am', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, - 'how': {'am', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, - 'which': {'am', 'is', 'are', 'was', 'can', 'should'} + 'what': {'am', 'was', 'were', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, + 'who': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'}, + 'where': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, + 'when': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, + 'why': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, + 'how': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, + 'which': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'} } # Auxiliaries that typically initiate Yes/No questions yesno_aux = { @@ -259,9 +259,24 @@ def Question(doc): if sent_text.endswith('?'): # try to find the first WH-word in the sentence wh = False - for token in sent_tokens: - if token.text.lower() in wh_words and token.tag_ in search_tags and token.dep_ not in {"relcl", "acl"}\ - and token.i < sent.root.i: + for i in range(len(sent_tokens) - 1): + tok1 = sent_tokens[i] + tok2 = sent_tokens[i + 1] + t1_lower = tok1.text.lower() + t2_lower = tok2.text.lower() + + # Rules for detecting WH-questions: + # tok2.dep_ != "relcl": catches relative clauses like "the book which is on the table" + # "any(t.text.lower() in wh_followers.get(t1_lower, set())" ... 
+ # catches WH-word and main verb having a noun clause in between (e.g., "which of these options is it") + if t1_lower in wh_words and tok1.tag_ in search_tags \ + and tok1.dep_ not in {"relcl", "acl"} \ + and tok2.dep_ != "relcl" \ + and any( + t.text.lower() in wh_followers.get(t1_lower, set()) + for t in sent_tokens[i+1:] + ): + wh = True break if wh: @@ -285,11 +300,17 @@ def Question(doc): counted_sentences.add(sent.start) break # WH pattern - if t1_lower in wh_words and tok1.tag_ in search_tags and tok1.dep_ not in {"relcl", "acl"}\ - and tok1.i < sent.root.i and t2_lower in wh_followers.get(t1_lower, set()): + if t1_lower in wh_words and tok1.tag_ in search_tags \ + and tok1.dep_ not in {"relcl", "acl"} \ + and tok2.dep_ != "relcl" \ + and any( + t.text.lower() in wh_followers.get(t1_lower, set()) + for t in sent_tokens[i+1:] + ): wh_count += 1 counted_sentences.add(sent.start) break + return yesno_count, wh_count From 40aee792b52bff96b816fe0208c12c093b246d2e Mon Sep 17 00:00:00 2001 From: Xinlan Emily Hu Date: Fri, 9 Jan 2026 21:43:16 -0800 Subject: [PATCH 08/10] an expanded and more robust version of the question function (with a little bit of help from claude) --- .../features/politeness_v2_helper.py | 225 +++++++++++++----- 1 file changed, 160 insertions(+), 65 deletions(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index a671d7af..634559b6 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -215,101 +215,196 @@ def bare_command(doc): return len(bc) +def is_in_subordinate_clause(tok, sent): + """ + Check if a token is inside a subordinate clause rather than the main clause. + """ + # Walk up from the token's head (not the token itself) + current = tok + while current.head != current and current != sent.root: + # Check if the HEAD has a subordinate clause dependency + if current.head.dep_ in {"advcl", "relcl", "acl", "ccomp", "xcomp"} and current.head != sent.root: + # We're attached to something that's a subordinate clause + return True + current = current.head + return False + +def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): + """ + Returns True if the WH-word token is part of a real main-clause question. + """ + # For WH-determiners (both with and without ?), use special logic + if tok.dep_ == "det": + noun = tok.head + + # Check: is the noun inside a complement clause? 
+ current = noun + while current.head != current and current != sent.root: + if current.dep_ in {"ccomp", "xcomp"}: + return False + current = current.head + + # If the noun is a subject (nsubj) and has a relcl ancestor, it's likely a misparsed question + if noun.dep_ in {"nsubj", "nsubjpass"}: + # This looks like a question with the WH-noun as subject + # Check for auxiliary + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + # If sentence ends with ?, accept it + if ends_with_question_mark: + return True + + # For non-subject WH-determiners, check close ancestors for relcl + if tok.dep_ == "det" and tok.head.dep_ != "relcl": + # Check head and head's head + if tok.head.head.dep_ == "relcl" and tok.head.head.i < tok.i: + # relcl is before WH-word, likely a real relative clause + return False + + # Check if there's an auxiliary after the WH-word/noun + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + return False + + # For other WH-words (not determiners) + # First check for complement clauses (ccomp, xcomp) - these are embedded questions + for anc in tok.ancestors: + if anc.dep_ in {"ccomp", "xcomp"}: + return False + + # Check if WH-word is attached to a verb that takes interrogative complements + # Verbs like: tell, ask, know, wonder, understand, explain, show, see, remember, etc. + complement_taking_verbs = { + 'tell', 'ask', 'know', 'wonder', 'understand', 'explain', + 'show', 'see', 'remember', 'forget', 'realize', 'figure', + 'decide', 'consider', 'discover', 'find', 'learn', 'teach' + } + + if tok.head.pos_ == "VERB" and tok.head.lemma_ in complement_taking_verbs: + # Check if there are tokens before this verb (indicating it's not sentence-initial) + tokens_before_verb = 0 + for t in sent: + if t.i >= tok.head.i: + break + if t.pos_ not in {"PUNCT", "INTJ"}: + tokens_before_verb += 1 + + # If there are 2+ tokens before the verb, WH is likely embedded + if tokens_before_verb >= 2: + return False + + # Check if has relcl ancestor + has_relcl_ancestor = False + for anc in tok.ancestors: + if anc.dep_ == "relcl": + has_relcl_ancestor = True + break + + if has_relcl_ancestor: + # Check if this is a misparsed main question vs real relative clause + # Count substantive tokens before the WH-word + substantive_before = 0 + for t in sent: + if t.i >= tok.i: + break + if t.pos_ not in {"INTJ", "PUNCT", "CCONJ", "DET"}: + substantive_before += 1 + + # If fewer than 3 substantive tokens before WH, likely a misparsed main question + if substantive_before < 3: + pass # Don't exclude it + else: + # Likely a real relative clause + return False + + # If the sentence ends with ?, be lenient for non-relcl WH-words + if ends_with_question_mark: + return True + + # For non-? sentences with non-determiner WH-words + if is_in_subordinate_clause(tok, sent): + return False + + if tok.dep_ not in {"nsubj", "nsubjpass", "csubj", "attr", "ROOT", "dobj", "pobj", "advmod"}: + return False + + for t in sent: + if not is_in_subordinate_clause(t, sent) and t.text.lower() in auxiliaries: + return True + + return False + def Question(doc): """ Counts the number of sentences containing question words and question marks. - Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py - Args: - doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed. - Returns: - tuple: A tuple containing the counts of Yes/No questions and WH-questions. 
- """ - # POS tags for WH-words like who/what/where - search_tags = {'WRB', 'WP', 'WDT'} - # WH-words and common auxiliaries that follow them in real questions - wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'} - wh_followers = { - 'what': {'am', 'was', 'were', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, - 'who': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'}, - 'where': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, - 'when': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, - 'why': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, - 'how': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, - 'which': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'} - } - # Auxiliaries that typically initiate Yes/No questions - yesno_aux = { + """ + search_tags = {'WRB', 'WP', 'WDT', 'WP$'} + wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which', 'whose', 'whom'} + + auxiliaries = { 'do', 'does', 'did', 'have', 'has', 'had', 'can', 'could', 'will', 'would', 'may', 'might', 'shall', 'should', 'is', 'are', 'was', 'were', 'am' } - # Pronouns that often follow auxiliaries in Yes/No questions - pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'} + pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it', 'these', 'those', 'this', 'that'} wh_count = 0 yesno_count = 0 counted_sentences = set() + for sent in doc.sents: sent_text = sent.text.strip() sent_tokens = list(sent) if not sent_tokens: continue - # Method 1: Find question sentences by checking for '?' at end + + # Method 1: Sentences ending with '?' if sent_text.endswith('?'): - # try to find the first WH-word in the sentence wh = False - for i in range(len(sent_tokens) - 1): - tok1 = sent_tokens[i] - tok2 = sent_tokens[i + 1] + for tok1 in sent_tokens: t1_lower = tok1.text.lower() - t2_lower = tok2.text.lower() - - # Rules for detecting WH-questions: - # tok2.dep_ != "relcl": catches relative clauses like "the book which is on the table" - # "any(t.text.lower() in wh_followers.get(t1_lower, set())" ... - # catches WH-word and main verb having a noun clause in between (e.g., "which of these options is it") - if t1_lower in wh_words and tok1.tag_ in search_tags \ - and tok1.dep_ not in {"relcl", "acl"} \ - and tok2.dep_ != "relcl" \ - and any( - t.text.lower() in wh_followers.get(t1_lower, set()) - for t in sent_tokens[i+1:] - ): - - wh = True - break + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=True): + wh = True + break if wh: wh_count += 1 else: - # Fallback: no WH in the sentence → treat as Yes/No question yesno_count += 1 counted_sentences.add(sent.start) continue - # Method 2: For remaining sentences, apply lexical rule-based detection --- Extract tokens and their metadata for fast access - for i in range(len(sent_tokens) - 1): - tok1 = sent_tokens[i] - tok2 = sent_tokens[i + 1] + + # Method 2: Lexical rule-based detection for sentences without '?' 
+ found_question = False + for tok1 in sent_tokens: + t1_lower = tok1.text.lower() + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=False): + wh_count += 1 + counted_sentences.add(sent.start) + found_question = True + break + + if found_question: + continue + + # Check for Yes/No questions + for tok1, tok2 in zip(sent_tokens, sent_tokens[1:] + [None]): t1_lower = tok1.text.lower() - t2_lower = tok2.text.lower() - if sent.start in counted_sentences: - break # already counted - # Yes/No pattern - if t1_lower in yesno_aux and t2_lower in pronoun_followers: + t2_lower = tok2.text.lower() if tok2 else None + + if tok1.i - sent.start > 1: + continue + + if t1_lower in auxiliaries and t2_lower in pronoun_followers: yesno_count += 1 counted_sentences.add(sent.start) break - # WH pattern - if t1_lower in wh_words and tok1.tag_ in search_tags \ - and tok1.dep_ not in {"relcl", "acl"} \ - and tok2.dep_ != "relcl" \ - and any( - t.text.lower() in wh_followers.get(t1_lower, set()) - for t in sent_tokens[i+1:] - ): - wh_count += 1 - counted_sentences.add(sent.start) - break return yesno_count, wh_count From 8e1593498420be521b7bb2a08baaf6abdd53398a Mon Sep 17 00:00:00 2001 From: Xinlan Emily Hu Date: Fri, 9 Jan 2026 21:57:13 -0800 Subject: [PATCH 09/10] fixing more edge cases --- src/team_comm_tools/features/politeness_v2_helper.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index 634559b6..d3b63c0c 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -233,6 +233,15 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): """ Returns True if the WH-word token is part of a real main-clause question. """ + # Check if sentence starts with auxiliary (not WH-word) - indicates Yes/No question + sent_tokens = list(sent) + if len(sent_tokens) >= 1: + first_tok = sent_tokens[0] + if first_tok.text.lower() in auxiliaries and first_tok.text.lower() not in {'what', 'who', 'where', 'when', 'why', 'how', 'which'}: + # Sentence starts with auxiliary - this is a Yes/No question + # Any WH-words are being used as content, not interrogatives + return False + # For WH-determiners (both with and without ?), use special logic if tok.dep_ == "det": noun = tok.head @@ -277,7 +286,6 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): return False # Check if WH-word is attached to a verb that takes interrogative complements - # Verbs like: tell, ask, know, wonder, understand, explain, show, see, remember, etc. 
     complement_taking_verbs = {
         'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
         'show', 'see', 'remember', 'forget', 'realize', 'figure',
         'decide', 'consider', 'discover', 'find', 'learn', 'teach'
     }

From a01178156c86968e7cd341ccefee484f58f70658 Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Sat, 10 Jan 2026 17:51:05 -0800
Subject: [PATCH 10/10] update robustness of question detector and update
 tests

---
 .../features/politeness_v2_helper.py        | 63 ++++++++++++++++---
 tests/data/cleaned_data/test_chat_level.csv | 30 ++++++++-
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index d3b63c0c..17a67602 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -238,8 +238,6 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
     if len(sent_tokens) >= 1:
         first_tok = sent_tokens[0]
         if first_tok.text.lower() in auxiliaries and first_tok.text.lower() not in {'what', 'who', 'where', 'when', 'why', 'how', 'which'}:
-            # Sentence starts with auxiliary - this is a Yes/No question
-            # Any WH-words are being used as content, not interrogatives
            return False
 
     # For WH-determiners (both with and without ?), use special logic
@@ -280,10 +278,58 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
         return False
 
     # For other WH-words (not determiners)
-    # First check for complement clauses (ccomp, xcomp) - these are embedded questions
-    for anc in tok.ancestors:
-        if anc.dep_ in {"ccomp", "xcomp"}:
+
+    # Special handling for WH-words that are subjects (nsubj, nsubjpass)
+    if tok.dep_ in {"nsubj", "nsubjpass"}:
+        # First check: is this part of a relative clause?
+        has_relcl_ancestor = False
+        for anc in tok.ancestors:
+            if anc.dep_ == "relcl":
+                has_relcl_ancestor = True
+                break
+
+        if has_relcl_ancestor:
+            # Check if there's a noun before the WH-word (ignoring punctuation)
+            # This is the typical relative clause pattern: "the book, which..."
+            has_noun_before = False
+            for t in sent:
+                if t.i >= tok.i:
+                    break
+                if t.pos_ in {"NOUN", "PROPN"}:
+                    has_noun_before = True
+
+            # If there's a noun before WH and it has relcl ancestor, it's a real relative clause
+            if has_noun_before:
+                return False
+
+        # Check if there's actually a complement-taking verb before the WH-word
+        complement_taking_verbs = {
+            'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
+            'show', 'see', 'remember', 'forget', 'realize', 'figure',
+            'decide', 'consider', 'discover', 'find', 'learn', 'teach'
+        }
+
+        has_complement_verb_before = False
+        for t in sent:
+            if t.i >= tok.i:
+                break
+            if t.pos_ == "VERB" and t.lemma_ in complement_taking_verbs:
+                has_complement_verb_before = True
+                break
+
+        # If no complement-taking verb before WH-word, it's a main question
+        if not has_complement_verb_before:
+            # Check for an auxiliary after the WH-word
+            for t in sent:
+                if t.i > tok.i and t.text.lower() in auxiliaries:
+                    return True
             return False
+        # If there IS a complement verb, fall through to normal checks
+
+    # First check for complement clauses (ccomp, xcomp) - these are embedded questions
+    for anc in tok.ancestors:
+        if anc.dep_ in {"ccomp", "xcomp"}:
+            return False
 
     # Check if WH-word is attached to a verb that takes interrogative complements
     complement_taking_verbs = {
@@ -333,7 +379,8 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
     # If the sentence ends with ?, be lenient for non-relcl WH-words
     if ends_with_question_mark:
         return True
 
-    # For non-? sentences with non-determiner WH-words
+    # For non-? sentences with non-determiner WH-words that are NOT nsubj
+    # (nsubj was already handled above)
     if is_in_subordinate_clause(tok, sent):
         return False
@@ -370,7 +417,7 @@ def Question(doc):
         sent_tokens = list(sent)
         if not sent_tokens:
             continue
-        
+
         # Method 1: Sentences ending with '?'
         if sent_text.endswith('?'):
             wh = False
@@ -397,7 +444,7 @@ def Question(doc):
                     counted_sentences.add(sent.start)
                     found_question = True
                     break
-        
+
         if found_question:
             continue
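The nsubj branch added above separates misparsed main-clause questions ("alrighty so which was your favorite") from genuine relative clauses ("The love which we had long sought for...") by asking whether any NOUN/PROPN precedes the WH-word: a noun in front signals a real relative clause, while only interjections or adverbs in front signal a question the parser mislabeled. A standalone probe of that criterion on the same sentences used in the test rows below (en_core_web_sm assumed):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    for text in ["alrighty so which was your favorite",
                 "The love which we had long sought for has finally arrived."]:
        sent = list(nlp(text).sents)[0]
        wh = next(t for t in sent if t.text.lower() == "which")
        noun_before = any(t.pos_ in {"NOUN", "PROPN"} and t.i < wh.i for t in sent)
        print(f"{text!r}: {'relative clause' if noun_before else 'WH-question'}")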
diff --git a/tests/data/cleaned_data/test_chat_level.csv b/tests/data/cleaned_data/test_chat_level.csv
index 2c2a1f31..09fe1887 100644
--- a/tests/data/cleaned_data/test_chat_level.csv
+++ b/tests/data/cleaned_data/test_chat_level.csv
@@ -88,14 +88,38 @@ I respond to that too",num_block_quote_responses,2
 1,A,hello,Hello_receptiveness_yeomans,1
 1,B,So how should we answer this,Token_count_receptiveness_yeomans,6
 1,A,We can start here. What is the question?,YesNo_Questions_receptiveness_yeomans,0
-1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1
 1,B,"Please help me figure this out, I really want to do well on this please",Please_receptiveness_yeomans,2
+1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1
+1,B,I am not sure. Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1
+1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1
+1B,A,is where different from why?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,is where different from why?,WH_Questions_receptiveness_yeomans,0
+1B,A,could where be different from why?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,could where be different from why?,WH_Questions_receptiveness_yeomans,0
+1B,A,ok so I don't really understand which one it is,YesNo_Questions_receptiveness_yeomans,0
+1B,B,ok so I don't really understand which one it is,WH_Questions_receptiveness_yeomans,0
+1B,A,"the book, which is on the table, is red",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"the book, which is on the table, is red",WH_Questions_receptiveness_yeomans,0
+1B,A,"The love which we had long sought for has finally arrived.",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"The love which we had long sought for has finally arrived.",WH_Questions_receptiveness_yeomans,0
+1B,A,alrighty so which was your favorite,YesNo_Questions_receptiveness_yeomans,0
+1B,B,alrighty so which was your favorite,WH_Questions_receptiveness_yeomans,1
+1B,A,"after trying all the cake, which ones were your favorites",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"after trying all the cake, which ones were your favorites",WH_Questions_receptiveness_yeomans,1
+1B,A,are these the books which you were looking for,YesNo_Questions_receptiveness_yeomans,1
+1B,B,are these the books which you were looking for,WH_Questions_receptiveness_yeomans,0
+1B,A,are these the books which you were looking for?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,are these the books which you were looking for?,WH_Questions_receptiveness_yeomans,0
+1B,A,"the book, which is on the table, is red?",YesNo_Questions_receptiveness_yeomans,1
+1B,B,"the book, which is on the table, is red?",WH_Questions_receptiveness_yeomans,0
+1B,A,can you tell me who it is? what is it? where?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,can you tell me who it is? what is it? where?,WH_Questions_receptiveness_yeomans,2
+1B,A,with whom did Napoleon fight his first war?,YesNo_Questions_receptiveness_yeomans,0
+1B,B,with whom did Napoleon fight his first war?,WH_Questions_receptiveness_yeomans,1
 2,C,Hey,Hello_receptiveness_yeomans,1
 2,C,Okay bro lets split it 50/50,Impersonal_Pronoun_receptiveness_yeomans,1
 2,D,Maybe but how about 60/40? I doubt its fair otherwise,Hedges_receptiveness_yeomans,2
 2,C,Seems fair,Hedges_receptiveness_yeomans,1
-1,B,I am not sure. Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1
-1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1
 2,C,Seems possible,hashedge_politeness_convokit,1
 2,E,I see what youre thinking but I disagree,Acknowledgement_receptiveness_yeomans,1
 2,E,We get only one chance so we should understand how to split it,Acknowledgement_receptiveness_yeomans,2
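Taken together, the series moves Question() from counting '?' characters to classifying by dependency structure. An end-to-end sketch against three of the new test rows -- the import path and the en_core_web_sm model are assumptions; the package's own harness drives the same cases through the CSV above:

    import spacy
    from team_comm_tools.features.politeness_v2_helper import Question

    nlp = spacy.load("en_core_web_sm")  # assumed model
    for text in [
        "Did you finish the report, which was due today?",  # expected (1, 0)
        "with whom did Napoleon fight his first war?",      # expected (0, 1)
        "can you tell me who it is? what is it? where?",    # expected (1, 2)
    ]:
        yesno, wh = Question(nlp(text))
        print(f"{text!r} -> yesno={yesno}, wh={wh}")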