diff --git a/.gitignore b/.gitignore index 524f3aed..aeadf1ab 100644 --- a/.gitignore +++ b/.gitignore @@ -58,4 +58,5 @@ node_modules/ # testing /output /vector_data -test.py \ No newline at end of file +test.py +test.ipynb \ No newline at end of file diff --git a/src/team_comm_tools/features/keywords.py b/src/team_comm_tools/features/keywords.py index a073a009..74e6ff63 100644 --- a/src/team_comm_tools/features/keywords.py +++ b/src/team_comm_tools/features/keywords.py @@ -1,3 +1,4 @@ +# reference: https://github.com/bbevis/politenessPy/blob/main/keywords.py kw = { "spacy_neg_only": { "Negative_Emotion": [ @@ -7260,7 +7261,34 @@ " sorry ", " woops ", " whoops ", - " oops " + " oops ", + " apology " ], + "Third_Person": [ + " he ", + " him ", + " his ", + " himself ", + " she ", + " her ", + " hers ", + " herself ", + " they ", + " them ", + " their ", + " theirs ", + " themselves " + ], + "Contrast_Conjunction": [ + " but ", + " however ", + " instead ", + " although ", + " even though ", + " despite ", + " and yet ", + " nevertheless ", + " nonetheless " ], "Ask_Agency": [ " do me a favor ", @@ -7365,7 +7393,6 @@ "Gratitude": [ " thank ", " thanks ", - " thank you ", " grateful ", " gratitude ", " cheers " @@ -14419,25 +14446,46 @@ " cock ", " crap ", " damn ", + " dammit ", + " damnit ", " dick ", + " dickhead ", + " dick-head ", " dumb ", + " dumbass ", + " dumb-ass ", + " dumb ass ", " dyke ", " fuck ", + " fucking ", + " fucker ", " goddam ", + " goddammit ", + " goddamed ", " hell ", + " horseshit ", " homo ", + " jackass ", + " motherfucker ", + " mother-fucker ", + " motherfucking ", " nigger ", + " nigra ", " piss ", " prick ", " pussy ", " queer ", " screw ", " shit ", + " shite ", + " shitting ", " sob ", - " sonofa ", " suck ", " sucked ", - " sucks " + " sucks ", + " twat ", + " wanker ", + " whore " ], "Truth_Intensifier": [ " really ", diff --git a/src/team_comm_tools/features/politeness_v2_helper.py 
b/src/team_comm_tools/features/politeness_v2_helper.py index 0385acd6..17a67602 100644 --- a/src/team_comm_tools/features/politeness_v2_helper.py +++ b/src/team_comm_tools/features/politeness_v2_helper.py @@ -215,32 +215,253 @@ def bare_command(doc): return len(bc) +def is_in_subordinate_clause(tok, sent): + """ + Check if a token is inside a subordinate clause rather than the main clause. + """ + # Walk up from the token's head (not the token itself) + current = tok + while current.head != current and current != sent.root: + # Check if the HEAD has a subordinate clause dependency + if current.head.dep_ in {"advcl", "relcl", "acl", "ccomp", "xcomp"} and current.head != sent.root: + # We're attached to something that's a subordinate clause + return True + current = current.head + return False + +def wh_is_real_question(tok, sent, auxiliaries, ends_with_question_mark=False): + """ + Returns True if the WH-word token is part of a real main-clause question. + """ + # Check if sentence starts with auxiliary (not WH-word) - indicates Yes/No question + sent_tokens = list(sent) + if len(sent_tokens) >= 1: + first_tok = sent_tokens[0] + if first_tok.text.lower() in auxiliaries and first_tok.text.lower() not in {'what', 'who', 'where', 'when', 'why', 'how', 'which'}: + return False + + # For WH-determiners (both with and without ?), use special logic + if tok.dep_ == "det": + noun = tok.head + + # Check: is the noun inside a complement clause? 
+ current = noun + while current.head != current and current != sent.root: + if current.dep_ in {"ccomp", "xcomp"}: + return False + current = current.head + + # If the noun is a subject (nsubj) and has a relcl ancestor, it's likely a misparsed question + if noun.dep_ in {"nsubj", "nsubjpass"}: + # This looks like a question with the WH-noun as subject + # Check for auxiliary + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + # If sentence ends with ?, accept it + if ends_with_question_mark: + return True + + # For non-subject WH-determiners, check close ancestors for relcl + if tok.dep_ == "det" and tok.head.dep_ != "relcl": + # Check head and head's head + if tok.head.head.dep_ == "relcl" and tok.head.head.i < tok.i: + # relcl is before WH-word, likely a real relative clause + return False + + # Check if there's an auxiliary after the WH-word/noun + for t in sent: + if t.i > noun.i and t.text.lower() in auxiliaries: + return True + + return False + + # For other WH-words (not determiners) + + # Special handling for WH-words that are subjects (nsubj, nsubjpass) + if tok.dep_ in {"nsubj", "nsubjpass"}: + # First check: is this part of a relative clause? + has_relcl_ancestor = False + for anc in tok.ancestors: + if anc.dep_ == "relcl": + has_relcl_ancestor = True + break + + if has_relcl_ancestor: + # Check if there's a noun before the WH-word (ignoring punctuation) + # This is the typical relative clause pattern: "the book, which..." 
+ has_noun_before = False + for t in sent: + if t.i >= tok.i: + break + if t.pos_ in {"NOUN", "PROPN"}: + has_noun_before = True + + # If there's a noun before WH and it has relcl ancestor, it's a real relative clause + if has_noun_before: + return False + + # Check if there's actually a complement-taking verb before the WH-word + complement_taking_verbs = { + 'tell', 'ask', 'know', 'wonder', 'understand', 'explain', + 'show', 'see', 'remember', 'forget', 'realize', 'figure', + 'decide', 'consider', 'discover', 'find', 'learn', 'teach' + } + + has_complement_verb_before = False + for t in sent: + if t.i >= tok.i: + break + if t.pos_ == "VERB" and t.lemma_ in complement_taking_verbs: + has_complement_verb_before = True + break + + # If no complement-taking verb before WH-word, it's a main question + if not has_complement_verb_before: + # Check for an auxiliary after the WH-word + for t in sent: + if t.i > tok.i and t.text.lower() in auxiliaries: + return True + return False + # If there IS a complement verb, fall through to normal checks + + # First check for complement clauses (ccomp, xcomp) - these are embedded questions + for anc in tok.ancestors: + if anc.dep_ in {"ccomp", "xcomp"}: + return False + + # Check if WH-word is attached to a verb that takes interrogative complements + complement_taking_verbs = { + 'tell', 'ask', 'know', 'wonder', 'understand', 'explain', + 'show', 'see', 'remember', 'forget', 'realize', 'figure', + 'decide', 'consider', 'discover', 'find', 'learn', 'teach' + } + + if tok.head.pos_ == "VERB" and tok.head.lemma_ in complement_taking_verbs: + # Check if there are tokens before this verb (indicating it's not sentence-initial) + tokens_before_verb = 0 + for t in sent: + if t.i >= tok.head.i: + break + if t.pos_ not in {"PUNCT", "INTJ"}: + tokens_before_verb += 1 + + # If there are 2+ tokens before the verb, WH is likely embedded + if tokens_before_verb >= 2: + return False + + # Check if has relcl ancestor + has_relcl_ancestor = False + 
for anc in tok.ancestors: + if anc.dep_ == "relcl": + has_relcl_ancestor = True + break + + if has_relcl_ancestor: + # Check if this is a misparsed main question vs real relative clause + # Count substantive tokens before the WH-word + substantive_before = 0 + for t in sent: + if t.i >= tok.i: + break + if t.pos_ not in {"INTJ", "PUNCT", "CCONJ", "DET"}: + substantive_before += 1 + + # If fewer than 3 substantive tokens before WH, likely a misparsed main question + if substantive_before < 3: + pass # Don't exclude it + else: + # Likely a real relative clause + return False + + # If the sentence ends with ?, be lenient for non-relcl WH-words + if ends_with_question_mark: + return True + + # For non-? sentences with non-determiner WH-words that are NOT nsubj + # (nsubj was already handled above) + if is_in_subordinate_clause(tok, sent): + return False + + if tok.dep_ not in {"nsubj", "nsubjpass", "csubj", "attr", "ROOT", "dobj", "pobj", "advmod"}: + return False + + for t in sent: + if not is_in_subordinate_clause(t, sent) and t.text.lower() in auxiliaries: + return True + + return False + def Question(doc): """ Counts the number of sentences containing question words and question marks. - - Args: - doc (spacy.tokens.Doc): The spaCy Doc object containing the text to be analyzed. - - Returns: - tuple: A tuple containing the counts of Yes/No questions and WH-questions. """ - - keywords = set([' who ', ' what ', ' where ', ' when ', ' why ', ' how ', ' which ']) - tags = set(['WRB', 'WP', 'WDT']) - - # doc = nlp(text) - sentences = [str(sent) for sent in doc.sents if '?' 
in str(sent)] - all_qs = len(sentences) - - n = 0 - for i in range(len(sentences)): - whq = [token.tag_ for token in nlp(sentences[i]) if token.tag_ in tags] - - if len(whq) > 0: - n += 1 - - return all_qs - n, n + search_tags = {'WRB', 'WP', 'WDT', 'WP$'} + wh_words = {'what', 'who', 'where', 'when', 'why', 'how', 'which', 'whose', 'whom'} + + auxiliaries = { + 'do', 'does', 'did', 'have', 'has', 'had', + 'can', 'could', 'will', 'would', + 'may', 'might', 'shall', 'should', + 'is', 'are', 'was', 'were', 'am' + } + pronoun_followers = {'i', 'you', 'we', 'he', 'she', 'they', 'it', 'these', 'those', 'this', 'that'} + + wh_count = 0 + yesno_count = 0 + counted_sentences = set() + + for sent in doc.sents: + sent_text = sent.text.strip() + sent_tokens = list(sent) + if not sent_tokens: + continue + + # Method 1: Sentences ending with '?' + if sent_text.endswith('?'): + wh = False + for tok1 in sent_tokens: + t1_lower = tok1.text.lower() + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=True): + wh = True + break + if wh: + wh_count += 1 + else: + yesno_count += 1 + counted_sentences.add(sent.start) + continue + + # Method 2: Lexical rule-based detection for sentences without '?' 
+ found_question = False + for tok1 in sent_tokens: + t1_lower = tok1.text.lower() + if t1_lower in wh_words and tok1.tag_ in search_tags: + if wh_is_real_question(tok1, sent, auxiliaries, ends_with_question_mark=False): + wh_count += 1 + counted_sentences.add(sent.start) + found_question = True + break + + if found_question: + continue + + # Check for Yes/No questions + for tok1, tok2 in zip(sent_tokens, sent_tokens[1:] + [None]): + t1_lower = tok1.text.lower() + t2_lower = tok2.text.lower() if tok2 else None + + if tok1.i - sent.start > 1: + continue + + if t1_lower in auxiliaries and t2_lower in pronoun_followers: + yesno_count += 1 + counted_sentences.add(sent.start) + break + + return yesno_count, wh_count def word_start(keywords, doc): diff --git a/tests/data/cleaned_data/test_chat_level.csv b/tests/data/cleaned_data/test_chat_level.csv index a463c715..09fe1887 100644 --- a/tests/data/cleaned_data/test_chat_level.csv +++ b/tests/data/cleaned_data/test_chat_level.csv @@ -88,14 +88,38 @@ I respond to that too",num_block_quote_responses,2 1,A,hello,Hello_receptiveness_yeomans,1 1,B,So how should we answer this,Token_count_receptiveness_yeomans,6 1,A,We can start here. What is the question?,YesNo_Questions_receptiveness_yeomans,0 -1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1 1,B,"Please help me figure this out, I really want to do well on this please",Please_receptiveness_yeomans,2 +1,B,I am not sure. Where is the rest of our team?,WH_Questions_receptiveness_yeomans,1 +1,B,I am not sure. 
Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1 +1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1 +1B,A,is where different from why?,YesNo_Questions_receptiveness_yeomans,1 +1B,B,is where different from why?,WH_Questions_receptiveness_yeomans,0 +1B,A,could where be different from why?,YesNo_Questions_receptiveness_yeomans,1 +1B,B,could where be different from why?,WH_Questions_receptiveness_yeomans,0 +1B,A,ok so I don't really understand which one it is,YesNo_Questions_receptiveness_yeomans,0 +1B,B,ok so I don't really understand which one it is,WH_Questions_receptiveness_yeomans,0 +1B,A,"the book, which is on the table, is red",YesNo_Questions_receptiveness_yeomans,0 +1B,B,"the book, which is on the table, is red",WH_Questions_receptiveness_yeomans,0 +1B,A,"The love which we had long sought for has finally arrived.",YesNo_Questions_receptiveness_yeomans,0 +1B,B,"The love which we had long sought for has finally arrived.",WH_Questions_receptiveness_yeomans,0 +1B,A,alrighty so which was your favorite,YesNo_Questions_receptiveness_yeomans,0 +1B,B,alrighty so which was your favorite,WH_Questions_receptiveness_yeomans,1 +1B,A,"after trying all the cake, which ones were your favorites",YesNo_Questions_receptiveness_yeomans,0 +1B,B,"after trying all the cake, which ones were your favorites",WH_Questions_receptiveness_yeomans,1 +1B,A,are these the books which you were looking for,YesNo_Questions_receptiveness_yeomans,1 +1B,B,are these the books which you were looking for,WH_Questions_receptiveness_yeomans,0 +1B,A,are these the books which you were looking for?,YesNo_Questions_receptiveness_yeomans,1 +1B,B,are these the books which you were looking for?,WH_Questions_receptiveness_yeomans,0 +1B,A,"the book, which is on the table, is red?",YesNo_Questions_receptiveness_yeomans,1 +1B,B,"the book, which is on the table, is red?",WH_Questions_receptiveness_yeomans,0 +1B,A,can you 
tell me who it is? what is it? where?,YesNo_Questions_receptiveness_yeomans,1 +1B,B,can you tell me who it is? what is it? where?,WH_Questions_receptiveness_yeomans,2 +1B,A,with whom did Napoleon fight his first war?,YesNo_Questions_receptiveness_yeomans,0 +1B,B,with whom did Napoleon fight his first war?,WH_Questions_receptiveness_yeomans,1 2,C,Hey,Hello_receptiveness_yeomans,1 2,C,Okay bro lets split it 50/50,Impersonal_Pronoun_receptiveness_yeomans,1 2,D,Maybe but how about 60/40? I doubt its fair otherwise,Hedges_receptiveness_yeomans,2 2,C,Seems fair,Hedges_receptiveness_yeomans,1 -1,B,I am not sure. Where is the rest of our team?,First_Person_Single_receptiveness_yeomans,1 -1,B,"Well please help me figure this out, I really want to do well on this please okay",factuality_politeness_convokit,1 2,C,Seems possible,hashedge_politeness_convokit,1 2,E,I see what youre thinking but I disagree,Acknowledgement_receptiveness_yeomans,1 2,E,We get only one chance so we should understand how to split it,Acknowledgement_receptiveness_yeomans,2 @@ -1437,11 +1461,17 @@ yeomans_test,yeomans_user_b,I guess they almost complete it.,Hedges_receptivenes yeomans_test,yeomans_user_a,"Ma'am, no my lady, do you know Mr. Smith?",Formal_Title_receptiveness_yeomans,2 yeomans_test,yeomans_user_b,"I agree, this is correct",Agreement_receptiveness_yeomans,2 yeomans_test,yeomans_user_a,This is for you. Why you don't understand it's for you?,For_You_receptiveness_yeomans,2 -yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,3 +yeomans_test,yeomans_user_b,"Thank you, I'm really grateful",Gratitude_receptiveness_yeomans,2 yeomans_test,yeomans_user_a,We here you. We totally understand,Acknowledgement_receptiveness_yeomans,2 yeomans_test,yeomans_user_b,"Shit. You dumb asshole, what the hell? Who's that bastard? 
Suck my dick.",Swearing_receptiveness_yeomans,7 yeomans_test,yeomans_user_a,"Hey hello good morning, oh actually good evening.",Hello_receptiveness_yeomans,4 yeomans_test,yeomans_user_b,Are you sure? Is this the guy? Did he lie to you?,YesNo_Questions_receptiveness_yeomans,3 +yeomans_test,yeomans_user_a,"Did you finish the report, which was due today?",YesNo_Questions_receptiveness_yeomans,1 +yeomans_test,yeomans_user_b,"Did you finish the report, which was due today?",WH_Questions_receptiveness_yeomans,0 +yeomans_test,yeomans_user_a,"We can start here. What is the question?",WH_Questions_receptiveness_yeomans,1 +yeomans_test,yeomans_user_b,"Has she met the teacher who helped you last year?",WH_Questions_receptiveness_yeomans,0 +yeomans_test,yeomans_user_a,"Do you know what time it is?",WH_Questions_receptiveness_yeomans,0 +yeomans_test,yeomans_user_b,"Have you read the article that explains why this happens?",WH_Questions_receptiveness_yeomans,0 yeomans_test,yeomans_user_a,I'm sorry I sincerely apologize.,Apology_receptiveness_yeomans,2 yeomans_test,yeomans_user_b,Wow! Amazing! Perfect!,Affirmation_receptiveness_yeomans,3 yeomans_test,yeomans_user_a,I love you. My friend,First_Person_Single_receptiveness_yeomans,2