From c3b4422cb3b959eba25fd321e6b2382e0ba752ba Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Tue, 16 Dec 2025 11:30:38 -0800
Subject: [PATCH 01/10] sync politeness updates

---
 .gitignore                                    |  3 +-
 src/team_comm_tools/features/keywords.py      | 57 ++++++++++-
 .../features/politeness_v2_helper.py          | 92 +++++++++++++++----
 3 files changed, 129 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 524f3aed..aeadf1ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,4 +58,5 @@ node_modules/
 # testing
 /output
 /vector_data
-test.py
\ No newline at end of file
+test.py
+test.ipynb
\ No newline at end of file

diff --git a/src/team_comm_tools/features/keywords.py b/src/team_comm_tools/features/keywords.py
index a073a009..842a7943 100644
--- a/src/team_comm_tools/features/keywords.py
+++ b/src/team_comm_tools/features/keywords.py
@@ -1,3 +1,4 @@
+# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py
 kw = {
     "spacy_neg_only": {
         "Negative_Emotion": [
@@ -7260,7 +7261,34 @@
             " sorry ",
             " woops ",
             " whoops ",
-            " oops "
+            " oops ",
+            " apology "
+        ],
+        "Third_Person": [
+            " he ",
+            " him ",
+            " his ",
+            " himself ",
+            " she ",
+            " her ",
+            " hers ",
+            " herself ",
+            " they ",
+            " them ",
+            " their ",
+            " theirs ",
+            " themselves "
+        ],
+        "Contrast_Conjunction": [
+            " but ",
+            " however ",
+            " instead ",
+            " although ",
+            " even though ",
+            " despite ",
+            " and yet ",
+            " nevertheless ",
+            " nonetheless "
         ],
         "Ask_Agency": [
             " do me a favor ",
@@ -7365,7 +7393,7 @@
         "Gratitude": [
             " thank ",
             " thanks ",
-            " thank you ",
+            #" thank you ",
             " grateful ",
             " gratitude ",
             " cheers "
@@ -14419,25 +14447,46 @@
             " cock ",
             " crap ",
             " damn ",
+            " dammit ",
+            " damnit ",
             " dick ",
+            " dickhead ",
+            " dick-head ",
             " dumb ",
+            " dumbass ",
+            " dumb-ass ",
+            " dumb ass ",
             " dyke ",
             " fuck ",
+            " fucking ",
+            " fucker ",
             " goddam ",
+            " goddammit ",
+            " goddamned ",
             " hell ",
+            " horseshit ",
             " homo ",
+            " jackass ",
+            " motherfucker ",
+            " mother-fucker ",
+            " motherfucking ",
             " nigger ",
+            " nigra ",
             " piss ",
             " prick ",
             " pussy ",
             " queer ",
             " screw ",
             " shit ",
+            " shite ",
+            " shitting ",
             " sob ",
-            " sonofa ",
             " suck ",
             " sucked ",
-            " sucks "
+            " sucks ",
+            " twat ",
+            " wanker ",
+            " whore "
         ],
         "Truth_Intensifier": [
             " really ",
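Each entry in the keyword lists above carries a leading and trailing space, which implies whole-word matching: the message text gets padded with spaces the same way, and occurrences are counted by substring search, so that " but " can never fire inside "butter". A minimal sketch of that convention, with a hypothetical matcher (the real counting code in team_comm_tools is not part of this patch):

    # Hypothetical sketch of whole-word matching via space padding; the
    # function name and punctuation handling are assumptions, not the
    # package's actual matcher.
    def count_keyword_hits(text, keywords):
        # Collapse whitespace and pad, so every word is space-delimited.
        padded = " " + " ".join(text.lower().split()) + " "
        return sum(padded.count(kw) for kw in keywords)

    count_keyword_hits("thanks, I am so grateful", [" thanks ", " grateful "])
    # -> 1: " grateful " hits, but "thanks," keeps its comma, so any
    # punctuation stripping must happen upstream of a matcher like this.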
diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 0385acd6..de117395 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -218,29 +218,85 @@ def bare_command(doc):
 def Question(doc):
     """
     Counts the number of sentences containing question words and question marks.
-
+    Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py
     Args:
         doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed.
-
     Returns:
         tuple: A tuple containing the counts of Yes/No questions and WH-questions.
     """
-
-    keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which '])
-    tags = set(['WRB', 'WP', 'WDT'])
-
-    # doc = nlp(text)
-    sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    all_qs = len(sentences)
-
-    n = 0
-    for i in range(len(sentences)):
-        whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-
-        if len(whq) > 0:
-            n += 1
-
-    return all_qs - n, n
+    # POS tags for WH-words like who/what/where
+    search_tags = {'WRB', 'WP', 'WDT'}
+    # WH-words and common auxiliaries that follow them in real questions
+    wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
+    wh_followers = {
+        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'who': {'is', 'are', 'was', 'can', 'should'},
+        'where': {'is', 'are', 'can', 'should'},
+        'when': {'is', 'are', 'can', 'should'},
+        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'is', 'are', 'was', 'can', 'should'}
+    }
+    # Auxiliaries that typically initiate Yes/No questions
+    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
+                 # 'can', 'could', 'will', 'would',
+                 'may', 'might', 'shall', 'should',
+                 'is', 'are', 'was', 'were', 'am'}
+    # Pronouns that often follow auxiliaries in Yes/No questions
+    pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
+    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm', 'like'}
+
+    wh_count = 0
+    yesno_count = 0
+    counted_sentences = set()
+    for sent in doc.sents:
+        sent_text = sent.text.strip()
+        sent_tokens = list(sent)
+        if not sent_tokens:
+            continue
+        # Method 1: Find question sentences by checking for '?' at end
+        if sent_text.endswith('?'):
+            # try to find the first WH-word in the sentence
+            wh = False
+            for token in sent_tokens:
+                if token.text.lower() in wh_words and token.tag_ in search_tags and token.dep_ not in {"relcl", "acl"}\
+                    and token.i < sent.root.i:
+                    wh = True
+                    break
+            if wh:
+                wh_count += 1
+            else:
+                # Fallback: no WH in the sentence → treat as Yes/No question
+                yesno_count += 1
+            counted_sentences.add(sent.start)
+            continue
+        # Method 2: For remaining sentences, apply lexical rule-based detection --- Extract tokens and their metadata for fast access
+        for i in range(len(sent_tokens) - 1):
+            tok1 = sent_tokens[i]
+            tok2 = sent_tokens[i + 1]
+            t1_lower = tok1.text.lower()
+            t2_lower = tok2.text.lower()
+            if sent.start in counted_sentences:
+                break  # already counted
+            # WH pattern
+            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
+            # Yes/No pattern
+            if t1_lower in yesno_aux and t2_lower in pronoun_followers:
+                yesno_count += 1
+                counted_sentences.add(sent.start)
+                break
+    return yesno_count, wh_count
+    # sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
+    # all_qs = len(sentences)
+    # n = 0
+    # for i in range(len(sentences)):
+    #     whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
+    #     if len(whq) > 0:
+    #         n += 1
+    # return all_qs - n, n
 
 
 def word_start(keywords, doc):
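The rewritten Question() above works in two passes: sentences ending in '?' are labeled WH only when a WH-word with a WRB/WP/WDT tag, outside relcl/acl, precedes the sentence root, and otherwise fall back to Yes/No (Method 1); sentences without '?' are scanned over adjacent token pairs for WH-word + follower or auxiliary + pronoun (Method 2). A rough usage sketch -- the en_core_web_sm model and the import path are assumptions; the function only requires a parsed spaCy Doc:

    import spacy
    from team_comm_tools.features.politeness_v2_helper import Question

    nlp = spacy.load("en_core_web_sm")  # assumed model choice

    # Method 1: sentences that end with '?'.
    print(Question(nlp("Where is the rest of our team?")))  # expected (0, 1)
    print(Question(nlp("Are you sure?")))                   # expected (1, 0)

    # Method 2: no '?'; pairs like ("what", "is") or ("did", "you").
    print(Question(nlp("what is the question")))            # expected (0, 1)
    print(Question(nlp("did you see that")))                # expected (1, 0)

The returned tuple is (yesno_count, wh_count), with Yes/No questions first, matching the docstring.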
From b388a328ff0e06d7c9f23a0e45f6f9215bdc917b Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Tue, 16 Dec 2025 15:13:46 -0800
Subject: [PATCH 02/10] add new WH/yesno questions to yeomans_test dataset

---
 tests/data/cleaned_data/test_chat_level.csv | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/data/cleaned_data/test_chat_level.csv b/tests/data/cleaned_data/test_chat_level.csv
index a463c715..2c2a1f31 100644
--- a/tests/data/cleaned_data/test_chat_level.csv
+++ b/tests/data/cleaned_data/test_chat_level.csv
@@ -1437,11 +1437,17 @@ yeomans_test,yeomans_user_b,I guess they almost complete it.,Hedges_receptivenes
 yeomans_test,yeomans_user_a,"Ma'am, no my lady, do you know Mr. Smith?",Formal_Title_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,"I agree, this is correct",Agreement_receptiveness_yeomans,2
 yeomans_test,yeomans_user_a,This is for you. Why you don't understand it's for you?,For_You_receptiveness_yeomans,2
-yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,3
+yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,2
 yeomans_test,yeomans_user_a,We here you. We totally understand,Acknowledgement_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,"Shit. You dumb asshole, what the hell? Who's that bastard? Suck my dick.",Swearing_receptiveness_yeomans,7
 yeomans_test,yeomans_user_a,"Hey hello good morning, oh actually good evening.",Hello_receptiveness_yeomans,4
 yeomans_test,yeomans_user_b,Are you sure? Is this the guy? Did he lie to you?,YesNo_Questions_receptiveness_yeomans,3
+yeomans_test,yeomans_user_a,"Did you finish the report, which was due today?",YesNo_Questions_receptiveness_yeomans,1
+yeomans_test,yeomans_user_b,"Did you finish the report, which was due today?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_a,"We can start here. What is the question?",WH_Questions_receptiveness_yeomans,1
+yeomans_test,yeomans_user_b,"Has she met the teacher who helped you last year?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_a,"Do you know what time it is?",WH_Questions_receptiveness_yeomans,0
+yeomans_test,yeomans_user_b,"Have you read the article that explains why this happens?",WH_Questions_receptiveness_yeomans,0
 yeomans_test,yeomans_user_a,I'm sorry I sincerely apologize.,Apology_receptiveness_yeomans,2
 yeomans_test,yeomans_user_b,Wow! Amazing! Perfect!,Affirmation_receptiveness_yeomans,3
 yeomans_test,yeomans_user_a,I love you. My friend,First_Person_Single_receptiveness_yeomans,2
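The two "report" rows pin down the intended behavior: in "Did you finish the report, which was due today?", the "which" opens a relative clause and must not count as a WH-question. Under Method 1 it is the token.i < sent.root.i guard that typically rejects it, since the WH-word sits after the root verb "finish"; the relcl/acl dependency check covers parses where the label lands on the WH-word itself. A quick probe of the parse (en_core_web_sm assumed; labels can vary across model versions):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    sent = list(nlp("Did you finish the report, which was due today?").sents)[0]
    wh = next(t for t in sent if t.text.lower() == "which")
    print(wh.tag_, wh.dep_, wh.i, "| root:", sent.root.text, sent.root.i)
    # The WH-word is indexed after the root, so Method 1 falls through to
    # the Yes/No branch: (YesNo, WH) = (1, 0), as the rows above expect.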
From 14dc29478e4173d067cda6eaca166333247db0 Mon Sep 17 00:00:00 2001
From: sundy1994
Date: Thu, 8 Jan 2026 10:25:18 -0800
Subject: [PATCH 03/10] Refactor Yes/No question detection and improve WH
 question handling in the Question function

---
 .../features/politeness_v2_helper.py | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index de117395..69dbb3ef 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -238,14 +238,15 @@ def Question(doc):
         'which': {'is', 'are', 'was', 'can', 'should'}
     }
     # Auxiliaries that typically initiate Yes/No questions
-    yesno_aux = {'do', 'does', 'did', 'have', 'has', 'had',
-                 # 'can', 'could', 'will', 'would',
-                 'may', 'might', 'shall', 'should',
-                 'is', 'are', 'was', 'were', 'am'}
+    yesno_aux = {
+        'do', 'does', 'did', 'have', 'has', 'had',
+        'can', 'could', 'will', 'would',
+        'may', 'might', 'shall', 'should',
+        'is', 'are', 'was', 'were', 'am'
+    }
     # Pronouns that often follow auxiliaries in Yes/No questions
     pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'}
-    # filler_words = {'ok', 'so', 'well', 'like', 'you know', 'i mean', 'actually', 'basically', 'right', 'just', 'uh', 'um', 'oh', 'hmm', 'like'}
-
+
     wh_count = 0
     yesno_count = 0
     counted_sentences = set()
@@ -278,25 +279,18 @@ def Question(doc):
             t2_lower = tok2.text.lower()
             if sent.start in counted_sentences:
                 break  # already counted
-            # WH pattern
-            if t1_lower in wh_words and t2_lower in wh_followers.get(t1_lower, set()):
-                wh_count += 1
-                counted_sentences.add(sent.start)
-                break
             # Yes/No pattern
             if t1_lower in yesno_aux and t2_lower in pronoun_followers:
                 yesno_count += 1
                 counted_sentences.add(sent.start)
                 break
+            # WH pattern
+            if t1_lower in wh_words and tok1.tag_ in search_tags and tok1.dep_ not in {"relcl", "acl"}\
+                and tok1.i < sent.root.i and t2_lower in wh_followers.get(t1_lower, set()):
+                wh_count += 1
+                counted_sentences.add(sent.start)
+                break
     return yesno_count, wh_count
-    # sentences = [str(sent) for sent in doc.sents if '?' in str(sent)]
-    # all_qs = len(sentences)
-    # n = 0
-    # for i in range(len(sentences)):
-    #     whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags]
-    #     if len(whq) > 0:
-    #         n += 1
-    # return all_qs - n, n
 
 
 def word_start(keywords, doc):
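This refactor tightens the non-'?' WH pattern with three syntactic guards: a WRB/WP/WDT tag, a dependency outside relcl/acl, and a position before the sentence root. One case still slips through: in "the book, which is on the table, is red", a typical parse attaches the relative-clause label to the verb rather than to "which" (which is a subject), and "which" still precedes the main root, so the pair ("which", "is") keeps matching. Later patches in this series close exactly that hole; the parse can be inspected as follows (en_core_web_sm assumed, labels may vary):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    sent = list(nlp("the book, which is on the table, is red").sents)[0]
    for tok in sent:
        if tok.text.lower() in {"which", "is"}:
            print(tok.text, tok.i, tok.tag_, tok.dep_, "<-", tok.head.text)
    # In a typical parse "which" is nsubj of the first "is", and that "is"
    # carries dep_ == "relcl" on "book" -- the motivation for the
    # tok2.dep_ != "relcl" guard that PATCH 07 adds on the follower token.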
From 125d82172948a8c5b2e5d4e58034b74e31c9ff2c Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 16:59:22 -0800
Subject: [PATCH 04/10] add 'am' to WH followers

---
 src/team_comm_tools/features/politeness_v2_helper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 69dbb3ef..8e83a54d 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -230,7 +230,7 @@ def Question(doc):
     wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
     wh_followers = {
         'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
-        'who': {'is', 'are', 'was', 'can', 'should'},
+        'who': {'am', 'is', 'are', 'was', 'can', 'should'},
         'where': {'is', 'are', 'can', 'should'},
         'when': {'is', 'are', 'can', 'should'},
         'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},

From a6e99a78a21349667626266f86da3a36dc9e8ecc Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 17:00:58 -0800
Subject: [PATCH 05/10] remove commented-out thank you

---
 src/team_comm_tools/features/keywords.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/team_comm_tools/features/keywords.py b/src/team_comm_tools/features/keywords.py
index 842a7943..74e6ff63 100644
--- a/src/team_comm_tools/features/keywords.py
+++ b/src/team_comm_tools/features/keywords.py
@@ -7393,7 +7393,6 @@
         "Gratitude": [
             " thank ",
             " thanks ",
-            #" thank you ",
             " grateful ",
             " gratitude ",
             " cheers "

From 183d1e633ac65bd62ee5c28969141527d204047f Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Thu, 8 Jan 2026 17:09:22 -0800
Subject: [PATCH 06/10] add am everywhere

---
 src/team_comm_tools/features/politeness_v2_helper.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index 8e83a54d..d1e5e765 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -229,13 +229,13 @@ def Question(doc):
     # WH-words and common auxiliaries that follow them in real questions
     wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
     wh_followers = {
-        'what': {'are', 'is', 'do', 'does', 'can', 'should', 'might'},
+        'what': {'am', 'are', 'is', 'do', 'does', 'can', 'should', 'might'},
         'who': {'am', 'is', 'are', 'was', 'can', 'should'},
-        'where': {'is', 'are', 'can', 'should'},
-        'when': {'is', 'are', 'can', 'should'},
-        'why': {'is', 'are', 'do', 'does', 'can', 'might', 'would'},
-        'how': {'is', 'are', 'do', 'does', 'can', 'should', 'would'},
-        'which': {'is', 'are', 'was', 'can', 'should'}
+        'where': {'am', 'is', 'are', 'can', 'should'},
+        'when': {'am', 'is', 'are', 'can', 'should'},
+        'why': {'am', 'is', 'are', 'do', 'does', 'can', 'might', 'would'},
+        'how': {'am', 'is', 'are', 'do', 'does', 'can', 'should', 'would'},
+        'which': {'am', 'is', 'are', 'was', 'can', 'should'}
     }

From bafe275c60c38a70134c5de89fccc98e28db693b Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Fri, 9 Jan 2026 15:19:35 -0800
Subject: [PATCH 07/10] update to question detection --- .../features/politeness_v2_helper.py | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index d1e5e765..a671d7af 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -229,13 +229,13 @@ def Question(doc): # WH-words and common auxiliaries that follow them in real questions wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'} wh_followers = { - 'what': {'am', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, - 'who': {'am', 'is', 'are', 'was', 'can', 'should'}, - 'where': {'am', 'is', 'are', 'can', 'should'}, - 'when': {'am', 'is', 'are', 'can', 'should'}, - 'why': {'am', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, - 'how': {'am', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, - 'which': {'am', 'is', 'are', 'was', 'can', 'should'} + 'what': {'am', 'was', 'were', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, + 'who': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'}, + 'where': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, + 'when': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, + 'why': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, + 'how': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, + 'which': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'} } # Auxiliaries that typically initiate Yes/No questions yesno_aux = { @@ -259,9 +259,24 @@ def Question(doc): if sent_text.endswith('?'): # try to find the first WH-word in the sentence wh = False - for token in sent_tokens: - if token.text.lower() in wh_words and token.tag_ in search_tags and token.dep_ not in {"relcl", "acl"}\ - and token.i < sent.root.i: + for i in range(len(sent_tokens) - 1): + tok1 = sent_tokens[i] + tok2 = sent_tokens[i + 1] + t1_lower = tok1.text.lower() + t2_lower = tok2.text.lower() + + # Rules for detecting WH-questions: + # tok2.dep_ != "relcl": catches relative clauses like "the book which is on the table" + # "any(t.text.lower() in wh_followers.get(t1_lower, set())" ... 
+ # catches WH-word and main verb having a noun clause in between (e.g., "which of these options is it") + if t1_lower in wh_words and tok1.tag_ in search_tags \ + and tok1.dep_ not in {"relcl", "acl"} \ + and tok2.dep_ != "relcl" \ + and any( + t.text.lower() in wh_followers.get(t1_lower, set()) + for t in sent_tokens[i+1:] + ): + wh = True break if wh: @@ -285,11 +300,17 @@ def Question(doc): counted_sentences.add(sent.start) break # WH pattern - if t1_lower in wh_words and tok1.tag_ in search_tags and tok1.dep_ not in {"relcl", "acl"}\ - and tok1.i < sent.root.i and t2_lower in wh_followers.get(t1_lower, set()): + if t1_lower in wh_words and tok1.tag_ in search_tags \ + and tok1.dep_ not in {"relcl", "acl"} \ + and tok2.dep_ != "relcl" \ + and any( + t.text.lower() in wh_followers.get(t1_lower, set()) + for t in sent_tokens[i+1:] + ): wh_count += 1 counted_sentences.add(sent.start) break + return yesno_count, wh_count From 40aee792b52bff96b816fe0208c12c093b246d2e Mon Sep 17 00:00:00 2001 From: Xinlan Emily Hu Date: Fri, 9 Jan 2026 21:43:16 -0800 Subject: [PATCH 08/10] an expanded and more robust version of the question function (with a little bit of help from claude) --- .../features/politeness_v2_helper.py | 225 +++++++++++++----- 1 file changed, 160 insertions(+), 65 deletions(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index a671d7af..634559b6 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -215,101 +215,196 @@ def bare_command(doc): return len(bc) +def is_in_subordinate_clause(tok, sent): + """ + Check if a token is inside a subordinate clause rather than the main clause. + """ + # Walk up from the token's head (not the token itself) + current = tok + while current.head != current and current != sent.root: + # Check if the HEAD has a subordinate clause dependency + if current.head.dep_ in {"advcl", "relcl", "acl", "ccomp", "xcomp"} and current.head != sent.root: + # We're attached to something that's a subordinate clause + return True + current = current.head + return False + +def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): + """ + Returns True if the WH-word token is part of a real main-clause question. + """ + # For WH-determiners (both with and without ?), use special logic + if tok.dep_ == "det": + noun = tok.head + + # Check: is the noun inside a complement clause? 
+ current = noun + while current.head != current and current != sent.root: + if current.dep_ in {"ccomp", "xcomp"}: + return False + current = current.head + + # If the noun is a subject (nsubj) and has a relcl ancestor, it's likely a misparsed question + if noun.dep_ in {"nsubj", "nsubjpass"}: + # This looks like a question with the WH-noun as subject + # Check for auxiliary + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + # If sentence ends with ?, accept it + if ends_with_question_mark: + return True + + # For non-subject WH-determiners, check close ancestors for relcl + if tok.dep_ == "det" and tok.head.dep_ != "relcl": + # Check head and head's head + if tok.head.head.dep_ == "relcl" and tok.head.head.i < tok.i: + # relcl is before WH-word, likely a real relative clause + return False + + # Check if there's an auxiliary after the WH-word/noun + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + return False + + # For other WH-words (not determiners) + # First check for complement clauses (ccomp, xcomp) - these are embedded questions + for anc in tok.ancestors: + if anc.dep_ in {"ccomp", "xcomp"}: + return False + + # Check if WH-word is attached to a verb that takes interrogative complements + # Verbs like: tell, ask, know, wonder, understand, explain, show, see, remember, etc. + complement_taking_verbs = { + 'tell', 'ask', 'know', 'wonder', 'understand', 'explain', + 'show', 'see', 'remember', 'forget', 'realize', 'figure', + 'decide', 'consider', 'discover', 'find', 'learn', 'teach' + } + + if tok.head.pos_ == "VERB" and tok.head.lemma_ in complement_taking_verbs: + # Check if there are tokens before this verb (indicating it's not sentence-initial) + tokens_before_verb = 0 + for t in sent: + if t.i >= tok.head.i: + break + if t.pos_ not in {"PUNCT", "INTJ"}: + tokens_before_verb += 1 + + # If there are 2+ tokens before the verb, WH is likely embedded + if tokens_before_verb >= 2: + return False + + # Check if has relcl ancestor + has_relcl_ancestor = False + for anc in tok.ancestors: + if anc.dep_ == "relcl": + has_relcl_ancestor = True + break + + if has_relcl_ancestor: + # Check if this is a misparsed main question vs real relative clause + # Count substantive tokens before the WH-word + substantive_before = 0 + for t in sent: + if t.i >= tok.i: + break + if t.pos_ not in {"INTJ", "PUNCT", "CCONJ", "DET"}: + substantive_before += 1 + + # If fewer than 3 substantive tokens before WH, likely a misparsed main question + if substantive_before < 3: + pass # Don't exclude it + else: + # Likely a real relative clause + return False + + # If the sentence ends with ?, be lenient for non-relcl WH-words + if ends_with_question_mark: + return True + + # For non-? sentences with non-determiner WH-words + if is_in_subordinate_clause(tok, sent): + return False + + if tok.dep_ not in {"nsubj", "nsubjpass", "csubj", "attr", "ROOT", "dobj", "pobj", "advmod"}: + return False + + for t in sent: + if not is_in_subordinate_clause(t, sent) and t.text.lower() in auxiliaries: + return True + + return False + def Question(doc): """ Counts the number of sentences containing question words and question marks. - Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py - Args: - doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed. - Returns: - tuple: A tuple containing the counts of Yes/No questions and WH-questions. 
- """ - # POS tags for WH-words like who/what/where - search_tags = {'WRB', 'WP', 'WDT'} - # WH-words and common auxiliaries that follow them in real questions - wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which'} - wh_followers = { - 'what': {'am', 'was', 'were', 'are', 'is', 'do', 'does', 'can', 'should', 'might'}, - 'who': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'}, - 'where': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, - 'when': {'am', 'was', 'were', 'is', 'are', 'can', 'should'}, - 'why': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'might', 'would'}, - 'how': {'am', 'was', 'were', 'is', 'are', 'do', 'does', 'can', 'should', 'would'}, - 'which': {'am', 'was', 'were', 'is', 'are', 'was', 'can', 'should'} - } - # Auxiliaries that typically initiate Yes/No questions - yesno_aux = { + """ + search_tags = {'WRB', 'WP', 'WDT', 'WP$'} + wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which', 'whose', 'whom'} + + auxiliaries = { 'do', 'does', 'did', 'have', 'has', 'had', 'can', 'could', 'will', 'would', 'may', 'might', 'shall', 'should', 'is', 'are', 'was', 'were', 'am' } - # Pronouns that often follow auxiliaries in Yes/No questions - pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it'} + pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it', 'these', 'those', 'this', 'that'} wh_count = 0 yesno_count = 0 counted_sentences = set() + for sent in doc.sents: sent_text = sent.text.strip() sent_tokens = list(sent) if not sent_tokens: continue - # Method 1: Find question sentences by checking for '?' at end + + # Method 1: Sentences ending with '?' if sent_text.endswith('?'): - # try to find the first WH-word in the sentence wh = False - for i in range(len(sent_tokens) - 1): - tok1 = sent_tokens[i] - tok2 = sent_tokens[i + 1] + for tok1 in sent_tokens: t1_lower = tok1.text.lower() - t2_lower = tok2.text.lower() - - # Rules for detecting WH-questions: - # tok2.dep_ != "relcl": catches relative clauses like "the book which is on the table" - # "any(t.text.lower() in wh_followers.get(t1_lower, set())" ... - # catches WH-word and main verb having a noun clause in between (e.g., "which of these options is it") - if t1_lower in wh_words and tok1.tag_ in search_tags \ - and tok1.dep_ not in {"relcl", "acl"} \ - and tok2.dep_ != "relcl" \ - and any( - t.text.lower() in wh_followers.get(t1_lower, set()) - for t in sent_tokens[i+1:] - ): - - wh = True - break + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=True): + wh = True + break if wh: wh_count += 1 else: - # Fallback: no WH in the sentence → treat as Yes/No question yesno_count += 1 counted_sentences.add(sent.start) continue - # Method 2: For remaining sentences, apply lexical rule-based detection --- Extract tokens and their metadata for fast access - for i in range(len(sent_tokens) - 1): - tok1 = sent_tokens[i] - tok2 = sent_tokens[i + 1] + + # Method 2: Lexical rule-based detection for sentences without '?' 
+ found_question = False + for tok1 in sent_tokens: + t1_lower = tok1.text.lower() + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=False): + wh_count += 1 + counted_sentences.add(sent.start) + found_question = True + break + + if found_question: + continue + + # Check for Yes/No questions + for tok1, tok2 in zip(sent_tokens, sent_tokens[1:] + [None]): t1_lower = tok1.text.lower() - t2_lower = tok2.text.lower() - if sent.start in counted_sentences: - break # already counted - # Yes/No pattern - if t1_lower in yesno_aux and t2_lower in pronoun_followers: + t2_lower = tok2.text.lower() if tok2 else None + + if tok1.i - sent.start > 1: + continue + + if t1_lower in auxiliaries and t2_lower in pronoun_followers: yesno_count += 1 counted_sentences.add(sent.start) break - # WH pattern - if t1_lower in wh_words and tok1.tag_ in search_tags \ - and tok1.dep_ not in {"relcl", "acl"} \ - and tok2.dep_ != "relcl" \ - and any( - t.text.lower() in wh_followers.get(t1_lower, set()) - for t in sent_tokens[i+1:] - ): - wh_count += 1 - counted_sentences.add(sent.start) - break return yesno_count, wh_count From 8e1593498420be521b7bb2a08baaf6abdd53398a Mon Sep 17 00:00:00 2001 From: Xinlan Emily Hu Date: Fri, 9 Jan 2026 21:57:13 -0800 Subject: [PATCH 09/10] fixing more edge cases --- src/team_comm_tools/features/politeness_v2_helper.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py index 634559b6..d3b63c0c 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -233,6 +233,15 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): """ Returns True if the WH-word token is part of a real main-clause question. """ + # Check if sentence starts with auxiliary (not WH-word) - indicates Yes/No question + sent_tokens = list(sent) + if len(sent_tokens) >= 1: + first_tok = sent_tokens[0] + if first_tok.text.lower() in auxiliaries and first_tok.text.lower() not in {'what', 'who', 'where', 'when', 'why', 'how', 'which'}: + # Sentence starts with auxiliary - this is a Yes/No question + # Any WH-words are being used as content, not interrogatives + return False + # For WH-determiners (both with and without ?), use special logic if tok.dep_ == "det": noun = tok.head @@ -277,7 +286,6 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): return False # Check if WH-word is attached to a verb that takes interrogative complements - # Verbs like: tell, ask, know, wonder, understand, explain, show, see, remember, etc. 
     complement_taking_verbs = {
         'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
         'show', 'see', 'remember', 'forget', 'realize', 'figure',
         'decide', 'consider', 'discover', 'find', 'learn', 'teach'
     }

From a01178156c86968e7cd341ccefee484f58f70658 Mon Sep 17 00:00:00 2001
From: Xinlan Emily Hu
Date: Sat, 10 Jan 2026 17:51:05 -0800
Subject: [PATCH 10/10] update robustness of question detector and update
 tests

---
 .../features/politeness_v2_helper.py        | 63 ++++++++++++++++---
 tests/data/cleaned_data/test_chat_level.csv | 30 ++++++++-
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/src/team_comm_tools/features/politeness_v2_helper.py b/src/team_comm_tools/features/politeness_v2_helper.py
index d3b63c0c..17a67602 100644
--- a/src/team_comm_tools/features/politeness_v2_helper.py
+++ b/src/team_comm_tools/features/politeness_v2_helper.py
@@ -238,8 +238,6 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
     if len(sent_tokens) >= 1:
         first_tok = sent_tokens[0]
         if first_tok.text.lower() in auxiliaries and first_tok.text.lower() not in {'what', 'who', 'where', 'when', 'why', 'how', 'which'}:
-            # Sentence starts with auxiliary - this is a Yes/No question
-            # Any WH-words are being used as content, not interrogatives
            return False
 
     # For WH-determiners (both with and without ?), use special logic
@@ -280,10 +278,58 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
         return False
 
     # For other WH-words (not determiners)
-    # First check for complement clauses (ccomp, xcomp) - these are embedded questions
-    for anc in tok.ancestors:
-        if anc.dep_ in {"ccomp", "xcomp"}:
+
+    # Special handling for WH-words that are subjects (nsubj, nsubjpass)
+    if tok.dep_ in {"nsubj", "nsubjpass"}:
+        # First check: is this part of a relative clause?
+        has_relcl_ancestor = False
+        for anc in tok.ancestors:
+            if anc.dep_ == "relcl":
+                has_relcl_ancestor = True
+                break
+
+        if has_relcl_ancestor:
+            # Check if there's a noun before the WH-word (ignoring punctuation)
+            # This is the typical relative clause pattern: "the book, which..."
+            has_noun_before = False
+            for t in sent:
+                if t.i >= tok.i:
+                    break
+                if t.pos_ in {"NOUN", "PROPN"}:
+                    has_noun_before = True
+
+            # If there's a noun before WH and it has relcl ancestor, it's a real relative clause
+            if has_noun_before:
+                return False
+
+        # Check if there's actually a complement-taking verb before the WH-word
+        complement_taking_verbs = {
+            'tell', 'ask', 'know', 'wonder', 'understand', 'explain',
+            'show', 'see', 'remember', 'forget', 'realize', 'figure',
+            'decide', 'consider', 'discover', 'find', 'learn', 'teach'
+        }
+
+        has_complement_verb_before = False
+        for t in sent:
+            if t.i >= tok.i:
+                break
+            if t.pos_ == "VERB" and t.lemma_ in complement_taking_verbs:
+                has_complement_verb_before = True
+                break
+
+        # If no complement-taking verb before WH-word, it's a main question
+        if not has_complement_verb_before:
+            # Check for an auxiliary after the WH-word
+            for t in sent:
+                if t.i > tok.i and t.text.lower() in auxiliaries:
+                    return True
             return False
+        # If there IS a complement verb, fall through to normal checks
+
+    # First check for complement clauses (ccomp, xcomp) - these are embedded questions
+    for anc in tok.ancestors:
+        if anc.dep_ in {"ccomp", "xcomp"}:
+            return False
 
     # Check if WH-word is attached to a verb that takes interrogative complements
     complement_taking_verbs = {
@@ -333,7 +379,8 @@ def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False):
     # If the sentence ends with ?, be lenient for non-relcl WH-words
     if ends_with_question_mark:
         return True
 
-    # For non-? sentences with non-determiner WH-words
+    # For non-? sentences with non-determiner WH-words that are NOT nsubj
+    # (nsubj was already handled above)
     if is_in_subordinate_clause(tok, sent):
         return False
@@ -370,7 +417,7 @@ def Question(doc):
         sent_tokens = list(sent)
         if not sent_tokens:
             continue
-        
+
         # Method 1: Sentences ending with '?'
         if sent_text.endswith('?'):
             wh = False
@@ -397,7 +444,7 @@ def Question(doc):
                     counted_sentences.add(sent.start)
                     found_question = True
                     break
-        
+
         if found_question:
             continue
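The nsubj branch added above separates misparsed main-clause questions ("alrighty so which was your favorite") from genuine relative clauses ("The love which we had long sought for...") by asking whether any NOUN/PROPN precedes the WH-word: a noun in front signals a real relative clause, while only interjections or adverbs in front signal a question the parser mislabeled. A standalone probe of that criterion on the same sentences used in the test rows below (en_core_web_sm assumed):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model
    for text in ["alrighty so which was your favorite",
                 "The love which we had long sought for has finally arrived."]:
        sent = list(nlp(text).sents)[0]
        wh = next(t for t in sent if t.text.lower() == "which")
        noun_before = any(t.pos_ in {"NOUN", "PROPN"} and t.i < wh.i for t in sent)
        print(f"{text!r}: {'relative clause' if noun_before else 'WH-question'}")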
diff --git a/tests/data/cleaned_data/test_chat_level.csv b/tests/data/cleaned_data/test_chat_level.csv
index 2c2a1f31..09fe1887 100644
--- a/tests/data/cleaned_data/test_chat_level.csv
+++ b/tests/data/cleaned_data/test_chat_level.csv
@@ -88,14 +88,38 @@ I respond to that too",num_block_quote_responses,2
 1,A,hello,Hello_receptiveness_yeomans,1
 1,B,So how should we answer this,Token_count_receptiveness_yeomans,6
 1,A,We can start here. What is the question?,YesNo_Questions_receptiveness_yeomans,0
-1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1
 1,B,"Please help me figure this out, I really want to do well on this please",Please_receptiveness_yeomans,2
+1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1
+1,B,I am not sure. Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1
+1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1
+1B,A,is where different from why?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,is where different from why?,WH_Questions_receptiveness_yeomans,0
+1B,A,could where be different from why?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,could where be different from why?,WH_Questions_receptiveness_yeomans,0
+1B,A,ok so I don't really understand which one it is,YesNo_Questions_receptiveness_yeomans,0
+1B,B,ok so I don't really understand which one it is,WH_Questions_receptiveness_yeomans,0
+1B,A,"the book, which is on the table, is red",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"the book, which is on the table, is red",WH_Questions_receptiveness_yeomans,0
+1B,A,"The love which we had long sought for has finally arrived.",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"The love which we had long sought for has finally arrived.",WH_Questions_receptiveness_yeomans,0
+1B,A,alrighty so which was your favorite,YesNo_Questions_receptiveness_yeomans,0
+1B,B,alrighty so which was your favorite,WH_Questions_receptiveness_yeomans,1
+1B,A,"after trying all the cake, which ones were your favorites",YesNo_Questions_receptiveness_yeomans,0
+1B,B,"after trying all the cake, which ones were your favorites",WH_Questions_receptiveness_yeomans,1
+1B,A,are these the books which you were looking for,YesNo_Questions_receptiveness_yeomans,1
+1B,B,are these the books which you were looking for,WH_Questions_receptiveness_yeomans,0
+1B,A,are these the books which you were looking for?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,are these the books which you were looking for?,WH_Questions_receptiveness_yeomans,0
+1B,A,"the book, which is on the table, is red?",YesNo_Questions_receptiveness_yeomans,1
+1B,B,"the book, which is on the table, is red?",WH_Questions_receptiveness_yeomans,0
+1B,A,can you tell me who it is? what is it? where?,YesNo_Questions_receptiveness_yeomans,1
+1B,B,can you tell me who it is? what is it? where?,WH_Questions_receptiveness_yeomans,2
+1B,A,with whom did Napoleon fight his first war?,YesNo_Questions_receptiveness_yeomans,0
+1B,B,with whom did Napoleon fight his first war?,WH_Questions_receptiveness_yeomans,1
 2,C,Hey,Hello_receptiveness_yeomans,1
 2,C,Okay bro lets split it 50/50,Impersonal_Pronoun_receptiveness_yeomans,1
 2,D,Maybe but how about 60/40? I doubt its fair otherwise,Hedges_receptiveness_yeomans,2
 2,C,Seems fair,Hedges_receptiveness_yeomans,1
-1,B,I am not sure. Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1
-1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1
 2,C,Seems possible,hashedge_politeness_convokit,1
 2,E,I see what youre thinking but I disagree,Acknowledgement_receptiveness_yeomans,1
 2,E,We get only one chance so we should understand how to split it,Acknowledgement_receptiveness_yeomans,2
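Taken together, the series moves Question() from counting '?' characters to classifying by dependency structure. An end-to-end sketch against three of the new test rows -- the import path and the en_core_web_sm model are assumptions; the package's own harness drives the same cases through the CSV above:

    import spacy
    from team_comm_tools.features.politeness_v2_helper import Question

    nlp = spacy.load("en_core_web_sm")  # assumed model
    for text in [
        "Did you finish the report, which was due today?",  # expected (1, 0)
        "with whom did Napoleon fight his first war?",      # expected (0, 1)
        "can you tell me who it is? what is it? where?",    # expected (1, 2)
    ]:
        yesno, wh = Question(nlp(text))
        print(f"{text!r} -> yesno={yesno}, wh={wh}")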