diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml deleted file mode 100644 index 29bfc57..0000000 --- a/.github/workflows/deploy.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: Deploy to GitHub Pages - -permissions: - contents: write - pages: write - -on: - push: - branches: [ "main", "master" ] - workflow_dispatch: -jobs: - deploy: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/quarto-ghp@master] diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml deleted file mode 100644 index 5608592..0000000 --- a/.github/workflows/test.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: CI -on: [workflow_dispatch, pull_request, push] - -jobs: - test: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/nbdev-ci@master] diff --git a/.gitignore b/.gitignore index 900add7..8d86c7b 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,9 @@ checklink/cookies.txt # Quarto .quarto + +checkpoints/ + +wandb/* + +*.parquet \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 3b106e8..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022, fastai - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5c0e7ce..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,5 +0,0 @@ -include settings.ini -include LICENSE -include CONTRIBUTING.md -include README.md -recursive-exclude * __pycache__ diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/textbooks_A2YN/gpt_labeling.py b/examples/textbooks_A2YN/gpt_labeling.py deleted file mode 100644 index b19c72e..0000000 --- a/examples/textbooks_A2YN/gpt_labeling.py +++ /dev/null @@ -1,49 +0,0 @@ -import os - -from datasets import concatenate_datasets, load_dataset -from squeakily.helpers import LLMLabeler -from treasure_trove.core import label_dataset - -instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. 
-High quality code has the following: -* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. -* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. -* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. -* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. -Medium quality code has the following: -* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. -* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. -* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. -* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. -Low quality code has the following: -* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. -* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. -* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. -* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. 
- -Output nothing other than one of the following labels: -""" - -labels = ["high quality", "medium quality", "low quality"] -api_key = os.environ["OPENAI_KEY"] -labeler = LLMLabeler(instruction, labels, model_name="gpt-4", api_key=api_key) # gpt-3.5-turbo - -languages = ["python", "go", "java", "javascript", "c", "c++"] -subsets = [] -for lang in languages: - ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] - sample = 50 / len(ds) - subset = label_dataset(ds, "content", labeler, labels, sample=sample, num_workers=8) - new_column = [lang] * len(subset) - subset = subset.add_column("language", new_column) - subsets.append(subset) - -labeled_ds = concatenate_datasets(subsets) - -# upload to huggingface -labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) - -# print number of each class -print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") -print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") -print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") diff --git a/examples/textbooks_A2YN/train_labeler.py b/examples/textbooks_A2YN/train_labeler.py deleted file mode 100644 index 1249930..0000000 --- a/examples/textbooks_A2YN/train_labeler.py +++ /dev/null @@ -1,37 +0,0 @@ -from datasets import load_dataset -from transformers import pipeline, TrainingArguments -from treasure_trove.core import filter_dataset, label_dataset, train_labeler - - -ds = load_dataset("CarperAI/textbooks_A2YN_labeled")["train"] -batch_size = 32 -training_args = TrainingArguments( - output_dir="./code_edu", - num_train_epochs=3, - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, - warmup_steps=500, - weight_decay=0.01, - logging_dir="./logs", - logging_steps=10, - evaluation_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - metric_for_best_model="accuracy", - greater_is_better=True, - seed=42, - push_to_hub=True, - hub_model_id="CarperAI/code_edu_classifier_py", - hub_private_repo=True, -) -base_model_name = "bigcode/starencoder" -model, tokenizer = train_labeler( - ds, - "content", - base_model_name, - n_labels=2, - training_args=training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -) \ No newline at end of file diff --git a/generate_embeddings.py b/generate_embeddings.py new file mode 100644 index 0000000..0e814d5 --- /dev/null +++ b/generate_embeddings.py @@ -0,0 +1,192 @@ +from abc import ABC +from datasets import ( + load_dataset, +) +from dotenv import load_dotenv +import torch +from typing import Union, List, Dict + +from train_labeler import EncoderParams + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + AutoModel, +) + +load_dotenv(".env") + +# https://huggingface.co/bigcode/starencoder/discussions/3 +# https://github.com/bigcode-project/bigcode-encoder/blob/master/embedding_sandbox.ipynb + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L152 +def pooling(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Pools a batch of vector sequences into a batch of vector global representations. + It does so by taking the last vector in the sequence, as indicated by the mask. + + Args: + x (torch.Tensor): Batch of vector sequences with shape [B, T, F]. + mask (torch.Tensor): Batch of masks with shape [B, T]. + + Returns: + torch.Tensor: Pooled version of the input batch with shape [B, F]. 
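+
+        Example (editor's illustrative addition, not part of the original patch):
+            >>> x = torch.randn(2, 4, 8)                      # [B, T, F]
+            >>> mask = torch.tensor([[1, 1, 0, 0],
+            ...                      [1, 1, 1, 1]])           # [B, T]
+            >>> pooling(x, mask).shape                        # picks x[0, 1] and x[1, 3]
+            torch.Size([2, 8])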
+ """ + + eos_idx = mask.sum(1) - 1 + batch_idx = torch.arange(len(eos_idx), device=x.device) + + mu = x[batch_idx, eos_idx, :] + + return mu + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L121 +def pool_and_normalize( + features_sequence: torch.Tensor, + attention_masks: torch.Tensor, + return_norms: bool = False, +) -> Union[torch.Tensor, List[torch.Tensor]]: + """Temporal pooling of sequences of vectors and projection onto the unit sphere. + + Args: + features_sequence (torch.Tensor): Inpute features with shape [B, T, F]. + attention_masks (torch.Tensor): Pooling masks with shape [B, T, F]. + return_norms (bool, optional): Whether to additionally return the norms. Defaults to False. + + Returns: + Union[torch.Tensor, List[torch.Tensor]]: Pooled and normalized vectors with shape [B, F]. + """ + + pooled_embeddings = pooling(features_sequence, attention_masks) + embedding_norms = pooled_embeddings.norm(dim=1) + + normalizing_factor = torch.where( # Only normalize embeddings with norm > 1.0. + embedding_norms > 1.0, embedding_norms, torch.ones_like(embedding_norms) + ) + + pooled_normalized_embeddings = pooled_embeddings / normalizing_factor[:, None] + + if return_norms: + return pooled_normalized_embeddings, embedding_norms + else: + return pooled_normalized_embeddings + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/constants.py + + +def set_device(inputs: Dict[str, torch.Tensor], device: str) -> Dict[str, torch.Tensor]: + output_data = {} + for k, v in inputs.items(): + output_data[k] = v.to(device) + + return output_data + + +def prepare_tokenizer(tokenizer_path): + try: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + except OSError: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_auth_token=True) + + tokenizer.add_special_tokens({"pad_token": EncoderParams.PAD_TOKEN}) + tokenizer.add_special_tokens({"sep_token": EncoderParams.SEPARATOR_TOKEN}) + tokenizer.add_special_tokens({"cls_token": EncoderParams.CLS_TOKEN}) + tokenizer.add_special_tokens({"mask_token": EncoderParams.MASK_TOKEN}) + return tokenizer + + +def truncate_sentences( + sentence_list: List[str], maximum_length: Union[int, float] +) -> List[str]: + truncated_sentences = [] + + for sentence in sentence_list: + truncated_sentences.append(sentence[:maximum_length]) + + return truncated_sentences + + +class StarEncoder(torch.nn.Module): + def __init__(self, device): + super().__init__() + + self.tokenizer = prepare_tokenizer(EncoderParams.base_model_name) + self.encoder = ( + AutoModel.from_pretrained( + EncoderParams.base_model_name, use_auth_token=True + ) + .to(device) + .eval() + ) + self.device = device + self.max_input_len = EncoderParams.max_input_length + self.maximum_token_len = EncoderParams.max_token_length + + def forward(self, input_sentences): + inputs = self.tokenizer( + [ + f"{EncoderParams.CLS_TOKEN}{sentence}{EncoderParams.SEPARATOR_TOKEN}" + for sentence in input_sentences + ], + padding="longest", + max_length=self.maximum_token_len, + truncation=True, + return_tensors="pt", + ) + + outputs = self.encoder(**set_device(inputs, self.device)) + embedding = pool_and_normalize(outputs.hidden_states[-1], inputs.attention_mask) + + return embedding + + def encode(self, input_sentences, batch_size=32, **kwargs): + truncated_input_sentences = truncate_sentences( + input_sentences, self.max_input_len + ) + + n_batches = len(truncated_input_sentences) // batch_size + int( + len(truncated_input_sentences) % batch_size > 0 + ) + 
+ embedding_batch_list = [] + + for i in range(n_batches): + start_idx = i * batch_size + end_idx = min((i + 1) * batch_size, len(truncated_input_sentences)) + + with torch.no_grad(): + embedding_batch_list.append( + self.forward(truncated_input_sentences[start_idx:end_idx]) + .detach() + .cpu() + ) + + input_sentences_embedding = torch.cat(embedding_batch_list) + + return input_sentences_embedding + + +tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length +) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +dataset = load_dataset("roborovski/phi-1") + +device = torch.device("cuda") +model = StarEncoder(device) + + +def process(x): + content = x["content"] + embedding = model.encode(content) + return {"embedding": embedding} + + +# process(dataset["train"][0]) + +processed_dataset = dataset.map(process, batched=True, batch_size=128) +processed_dataset.push_to_hub("roborovski/phi-2-embeddings") diff --git a/gpt_labeling.py b/gpt_labeling.py new file mode 100644 index 0000000..9f5eac1 --- /dev/null +++ b/gpt_labeling.py @@ -0,0 +1,105 @@ +import os +from pathlib import Path + +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) +from dotenv import load_dotenv + +import time +from treasure_trove.core import ChatGPTLabeler, instruction + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] +lang = "python" +max_chars = 4_096 +num_workers = 8 +labeler = ChatGPTLabeler( + instruction, + labels, + secondary_labels=secondary_labels, +) +dataset_chunks = [] + +buffer_size = 500 +num_chunks = 100 + +print("Loading dataset..") +print("Loaded dataset.") + +api_key = os.environ["OPENAI_KEY"] + +max_failures = 5 +failures = 0 + +ckpt_dir = "./checkpoints" +Path(ckpt_dir).mkdir(exist_ok=True) + + +def process(x): + failures = 0 + total_cost = 0 + label_idx, cost_info = 0, {} + while failures < max_failures: + try: + label_idx, cost_info = labeler(x["content"][:max_chars]) + time.sleep(1) + break + except Exception as e: + failures += 1 + print(e) + time.sleep(1) + if cost_info: + total_cost = cost_info["total_cost"] + print( + f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']} | {cost_info['total_cost']}" + ) + else: + print("row not classified.") + return {"label": label_idx, "cost": total_cost} + + +processed_chunk_datasets = [] + +first_save_idx = 8000 + +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) + + # Label the subset + subset = subset.map(process, batched=False, num_proc=4) + + processed_chunk_datasets.append(subset) + + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) + + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] 
== 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) diff --git a/llama_inference.py b/llama_inference.py new file mode 100644 index 0000000..27daa08 --- /dev/null +++ b/llama_inference.py @@ -0,0 +1,49 @@ +from transformers import AutoTokenizer +import transformers +import torch + +model = "../llama-7bf-hf" + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. + +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +tokenizer = AutoTokenizer.from_pretrained(model) +pipeline = transformers.pipeline( + "conversational", + model=model, + torch_dtype=torch.float16, + device_map="auto", +) + +sequences = pipeline( + instruction_simple, + do_sample=True, + top_k=10, + num_return_sequences=1, + eos_token_id=tokenizer.eos_token_id, + max_length=200, +) +for seq in sequences: + print(f"Result: {seq['generated_text']}") + diff --git a/llama_labeling.py b/llama_labeling.py new file mode 100644 index 0000000..0cffef9 --- /dev/null +++ b/llama_labeling.py @@ -0,0 +1,155 @@ +from typing import Optional, List + +import fire +import re + +from llama import Llama + + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +def find_label(text: str, labels: List[str]): + for i, label in enumerate(labels): + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) + if bool(match): + return i + return None + + +import os +from pathlib import Path + +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) +from dotenv import load_dotenv + +import time + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] +lang = "python" +max_chars = 4_096 +num_workers = 8 +dataset_chunks = [] + +buffer_size = 500 +num_chunks = 100 + +print("Loading dataset..") +print("Loaded dataset.") + +max_failures = 5 +failures = 0 + +max_gen_len = 512 +max_seq_len = 1024 +temperature = 0.1 +top_p = 0.2 +max_batch_size = 4 + + +ckpt_dir = "../llama/7Bf" +tokenizer_path = "../llama/tokenizer.model" + +generator = Llama.build( + ckpt_dir=ckpt_dir, + tokenizer_path=tokenizer_path, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, +) + + +def process(x): + total_cost = 0 + label_idx = 0 + dialogs = [] + for i in range(len(x["content"])): + code_sample = x["content"][i][:max_gen_len] + dialogs.append( + [ + {"role": "system", "content": instruction_simple}, + {"role": "user", "content": code_sample}, + ] + ) + results = generator.chat_completion( + dialogs, # type: ignore + max_gen_len=max_gen_len, + temperature=temperature, + top_p=top_p, + ) + batch_labels = [] + for i in range(len(dialogs)): + completion_text = results[i]["generation"]["content"] + label = find_label(completion_text, labels) + batch_labels.append(label) + return {"label": batch_labels} + + +processed_chunk_datasets = [] + +first_save_idx = 8000 + +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) + + # Label the subset + subset = subset.map(process, batched=True, batch_size=max_batch_size, num_proc=1) + + processed_chunk_datasets.append(subset) + + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) + + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb deleted file mode 100644 index 7e7aea7..0000000 --- a/nbs/00_core.ipynb +++ /dev/null @@ -1,513 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# core\n", - "\n", - "> Fill in a module description here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | default_exp core" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "# | export\n", - "import evaluate\n", - "import time\n", - "\n", - "import numpy as np\n", - "\n", - "from transformers import (\n", - " AutoModelForSequenceClassification,\n", - " AutoTokenizer,\n", - " DataCollatorWithPadding,\n", - " Trainer,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from nbdev.showdoc import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def classify(x, labels, llm_labeler, max_failures=5, default_label=0):\n", - " failures = 0\n", - " while failures < max_failures:\n", - " try:\n", - " label = labels.index(llm_labeler(x)[0])\n", - " time.sleep(1)\n", - " return label\n", - " except Exception as e:\n", - " failures += 1\n", - " print(e)\n", - " time.sleep(1)\n", - " pass\n", - " if failures == max_failures:\n", - " return default_label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def label_dataset(\n", - " dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096\n", - "):\n", - " \"\"\"\n", - " Filters a dataset using a labeler model.\n", - "\n", - " Args:\n", - " dataset (datasets.Dataset): Dataset to filter\n", - " text_column (str): Name of the column containing the text to classify\n", - " labeler_model (Any): Model to use for labeling\n", - " labels (List[str]): List of labels\n", - " sample (float): The fraction of the dataset to label and use for filtering\n", - " batch_size (int): Batch size for labeling\n", - " num_workers (int): Number of workers for labeling\n", - " max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors)\n", - " \"\"\"\n", - "\n", - " # Get a subset of the dataset\n", - " subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample)))\n", - "\n", - " # Label the subset\n", - " subset = subset.map(\n", - " lambda x: {\"label\": classify(x[text_column][:max_chars], labels, labeler_model)},\n", - " batched=False,\n", - " num_proc=num_workers,\n", - " )\n", - "\n", - " return subset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using custom data configuration bigcode--the-stack-smol-8f8055c3a4e4b4e3\n", - "Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-8f8055c3a4e4b4e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cfb95116fc20477bb047848972658d69", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Find the treasure in your trove of data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "from datasets import load_dataset\n", - "from squeakily.helpers import LLMLabeler\n", - "from transformers import pipeline, TrainingArguments\n", - "from treasure_trove.core import filter_dataset, label_dataset, train_labeler\n", - "\n", - "instruction = \"\"\"Please label the following code as either educational or non-educational.\n", - "Educational code is code that is well written, follows best practices, has documentation such that it 
might be found in a textbook.\n", - "Non-educational code is code that is poorly written, lacks documentation, contain bugs, or is not idiomatic.\n", - "Labels:\n", - "\"\"\"\n", - "labels = [\"educational\", \"non-educational\"]\n", - "api_key = \"\"\n", - "labeler = LLMLabeler(instruction, labels, model_name=\"gpt-4\", api_key=api_key)\n", - "\n", - "ds = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/python\")[\"train\"]\n", - "\n", - "# Get the training arguments\n", - "batch_size=4,\n", - "training_args = TrainingArguments(\n", - " output_dir=\"./code_edu\",\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=batch_size,\n", - " per_device_eval_batch_size=batch_size,\n", - " warmup_steps=500,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=10,\n", - " evaluation_strategy=\"epoch\",\n", - " save_strategy=\"epoch\",\n", - " load_best_model_at_end=True,\n", - " metric_for_best_model=\"accuracy\",\n", - " greater_is_better=True,\n", - " seed=42,\n", - " push_to_hub=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "subset = label_dataset(ds, \"content\", labeler, labels, sample=0.001)\n", - "base_model_name = \"bigcode/starencoder\"\n", - "model, tokenizer = train_labeler(\n", - " subset,\n", - " \"content\",\n", - " base_model_name,\n", - " n_labels=len(labels),\n", - " training_args=training_args,\n", - " num_workers=4,\n", - " max_length=512,\n", - " push_to_hub=True,\n", - ")\n", - "pipe = pipeline(\n", - " \"text-classification\", model=model, tokenizer=tokenizer, device=model.device\n", - ")\n", - "filtered_ds = filter_dataset(ds, \"content\", model, labels.index(\"educational\"))\n", - "filtered_ds.push_to_hub(\"ncoop57/code_edu\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml deleted file mode 100644 index 0a6dfcb..0000000 --- a/nbs/_quarto.yml +++ /dev/null @@ -1,20 +0,0 @@ -project: - type: website - -format: - html: - theme: cosmo - css: styles.css - toc: true - -website: - twitter-card: true - open-graph: true - repo-actions: [issue] - navbar: - background: primary - search: true - sidebar: - style: floating - -metadata-files: [nbdev.yml, sidebar.yml] \ No newline at end of file diff --git a/nbs/index.ipynb b/nbs/index.ipynb deleted file mode 100644 index 5e9fc26..0000000 --- a/nbs/index.ipynb +++ /dev/null @@ -1,96 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from treasure_trove.core import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# treasure_trove\n", - "\n", - "> Find the treasure in your trove of data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This file will become your README and also the index of your documentation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```sh\n", - "pip install treasure_trove\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to use" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fill me in please! 
Don't forget code examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1 + 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml deleted file mode 100644 index 8264f3b..0000000 --- a/nbs/nbdev.yml +++ /dev/null @@ -1,9 +0,0 @@ -project: - output-dir: _docs - -website: - title: "treasure_trove" - site-url: "https://CarperAI.github.io/treasure_trove" - description: "Find the treasure in your trove of data" - repo-branch: main - repo-url: "https://github.com/CarperAI/treasure_trove" diff --git a/nbs/styles.css b/nbs/styles.css deleted file mode 100644 index 66ccc49..0000000 --- a/nbs/styles.css +++ /dev/null @@ -1,37 +0,0 @@ -.cell { - margin-bottom: 1rem; -} - -.cell > .sourceCode { - margin-bottom: 0; -} - -.cell-output > pre { - margin-bottom: 0; -} - -.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { - margin-left: 0.8rem; - margin-top: 0; - background: none; - border-left: 2px solid lightsalmon; - border-top-left-radius: 0; - border-top-right-radius: 0; -} - -.cell-output > .sourceCode { - border: none; -} - -.cell-output > .sourceCode { - background: none; - margin-top: 0; -} - -div.description { - padding-left: 2px; - padding-top: 5px; - font-style: italic; - font-size: 135%; - opacity: 70%; -} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..89399b9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +accelerate +datasets +evaluate +fastcore +openai +transformers +python-dotenv +pandas +wandb +huggingface_hub \ No newline at end of file diff --git a/settings.ini b/settings.ini deleted file mode 100644 index 3e8da59..0000000 --- a/settings.ini +++ /dev/null @@ -1,43 +0,0 @@ -[DEFAULT] -# All sections below are required unless otherwise specified. -# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. 
- -### Python library ### -repo = treasure_trove -lib_name = %(repo)s -version = 0.0.1 -min_python = 3.7 -license = apache2 -black_formatting = False - -### nbdev ### -doc_path = _docs -lib_path = treasure_trove -nbs_path = nbs -recursive = True -tst_flags = notest -put_version_in_init = True - -### Docs ### -branch = main -custom_sidebar = False -doc_host = https://%(user)s.github.io -doc_baseurl = /%(repo)s -git_url = https://github.com/%(user)s/%(repo)s -title = %(lib_name)s - -### PyPI ### -audience = Developers -author = ncoop57 -author_email = nacooper01@email.wm.edu -copyright = 2023 onwards, %(author)s -description = Find the treasure in your trove of data -keywords = nbdev jupyter notebook python -language = English -status = 3 -user = CarperAI - -### Optional ### -requirements = accelerate datasets evaluate fastcore langchain openai squeakily transformers -dev_requirements = black[jupyter] ipykernel -# console_scripts = \ No newline at end of file diff --git a/train_labeler.py b/train_labeler.py new file mode 100644 index 0000000..860c003 --- /dev/null +++ b/train_labeler.py @@ -0,0 +1,156 @@ +from dataclasses import dataclass +from datasets import load_dataset +from transformers import pipeline, TrainingArguments +import evaluate +import numpy as np +import wandb +from dotenv import load_dotenv +from huggingface_hub import login +import os + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, +) + +load_dotenv(".env") + +login(token=os.environ["HF_KEY"], add_to_git_credential=True) + +@dataclass +class EncoderParams: + batch_size = 32 + num_workers = 16 + push_to_hub = True + n_labels = 3 + text_column = "content" + labels = ["high quality", "medium quality", "low quality"] + base_model_name = "bigcode/starencoder" + id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} + label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} + MASK_TOKEN = "" + SEPARATOR_TOKEN = "" + PAD_TOKEN = "" + CLS_TOKEN = "" + max_input_length = 1024 + max_token_length = 1024 + + +def train(): + + dataset = load_dataset("roborovski/phi-2-labeled")["train"] + + tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForSequenceClassification.from_pretrained( + EncoderParams.base_model_name, + num_labels=EncoderParams.n_labels, + max_length=EncoderParams.max_token_length, + id2label=EncoderParams.id2label, + label2id=EncoderParams.label2id, + ) + + sample_table_data = [] + + def compute_metrics(eval_pred): + logits, labels = eval_pred + if isinstance(logits, tuple): + logits = logits[0] + predictions = np.argmax(logits, axis=-1) + acc = acc_metric.compute(predictions=predictions, references=labels) + precision = precision_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + recall = recall_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + f1 = f1_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + + decoded_sample = tokenizer.decode(predictions) + sample_table_data.append([decoded_sample, labels[0]]) + sample_table = wandb.Table( + columns=["sample", "label"], + data=sample_table_data, + ) + wandb.log({"sample": sample_table}) + + return 
{**acc, **precision, **recall, **f1} + + dataset = dataset.map( + lambda x: tokenizer( + x[EncoderParams.text_column], + padding="max_length", + truncation=True, + max_length=EncoderParams.max_input_length, + ), + batched=True, + num_proc=EncoderParams.num_workers, + ) + + dataset = dataset.train_test_split(test_size=0.05, seed=42) + + train_dataset = dataset["train"].shuffle(seed=42) + eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + acc_metric = evaluate.load("accuracy") + precision_metric = evaluate.load("precision") + recall_metric = evaluate.load("recall") + f1_metric = evaluate.load("f1") + + wandb.login() + + wandb.init(project="phi-2-classifier") + + training_args = TrainingArguments( + output_dir="checkpoints", + num_train_epochs=100, + per_device_train_batch_size=EncoderParams.batch_size, + per_device_eval_batch_size=2, + warmup_steps=500, + weight_decay=0.01, + logging_dir="logs", + logging_steps=50, + eval_steps=5000, + evaluation_strategy="steps", + save_strategy="epoch", + save_steps=5, + seed=42, + push_to_hub=True, + hub_model_id="roborovski/phi-2-classifier", + hub_private_repo=True, + eval_accumulation_steps=1, + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + trainer.train() + + if EncoderParams.push_to_hub: + trainer.push_to_hub() + + +if __name__ == "__main__": + train() diff --git a/treasure_trove/__init__.py b/treasure_trove/__init__.py index f102a9c..e69de29 100644 --- a/treasure_trove/__init__.py +++ b/treasure_trove/__init__.py @@ -1 +0,0 @@ -__version__ = "0.0.1" diff --git a/treasure_trove/_modidx.py b/treasure_trove/_modidx.py deleted file mode 100644 index 79d02e9..0000000 --- a/treasure_trove/_modidx.py +++ /dev/null @@ -1,11 +0,0 @@ -# Autogenerated by nbdev - -d = { 'settings': { 'branch': 'main', - 'doc_baseurl': '/treasure_trove', - 'doc_host': 'https://CarperAI.github.io', - 'git_url': 'https://github.com/CarperAI/treasure_trove', - 'lib_path': 'treasure_trove'}, - 'syms': { 'treasure_trove.core': { 'treasure_trove.core.classify': ('core.html#classify', 'treasure_trove/core.py'), - 'treasure_trove.core.filter_dataset': ('core.html#filter_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.label_dataset': ('core.html#label_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.train_labeler': ('core.html#train_labeler', 'treasure_trove/core.py')}}} diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 0fc06ac..372a8d9 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,13 +1,5 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. - -# %% auto 0 -__all__ = ['classify', 'label_dataset', 'train_labeler', 'filter_dataset'] - -# %% ../nbs/00_core.ipynb 2 -import evaluate -import time - -import numpy as np +import re +import os from transformers import ( AutoModelForSequenceClassification, @@ -15,133 +7,113 @@ DataCollatorWithPadding, Trainer, ) +import time +import openai + +openai.api_key = os.getenv("OPENAI_KEY") + + +from typing import List + +instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. 
+High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. +* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. +* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. +* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. +* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. +* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. +* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. +* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. +* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. +* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. + +Output nothing other than one of the following labels: +{0} +""" + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + + + +class ChatGPTLabeler: + def __init__( + self, + instruction: str, + labels: List[str], + secondary_labels: List[str], + ): + self.instruction = instruction + self.labels = labels + self.secondary_labels = secondary_labels + + def find_label(self, text: str, labels: List[str]): + for i, label in enumerate(labels): + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) + if bool(match): + return i + return None + + def cost_info(self, oai_response): + prompt_tokens = oai_response["usage"]["prompt_tokens"] + completion_tokens = oai_response["usage"]["completion_tokens"] + total_cost = 0.0015 * prompt_tokens + 0.0002 * completion_tokens + + return dict( + total_cost=total_cost, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + + def __call__(self, text: str): + formatted_instruction = instruction.format(self.labels) + completion = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + temperature=0, + max_tokens=4, + messages=[ + {"role": "system", "content": formatted_instruction}, + {"role": "user", "content": text}, + ], + ) + if "error" in completion: + return 0, None + output_text = completion["choices"][0]["message"]["content"] + label_idx = self.find_label(output_text, self.labels) + if not label_idx: + label_idx = self.find_label(output_text, self.secondary_labels) + cost_info = self.cost_info(completion) + if not label_idx: + raise Exception(f"Label not found in text: {output_text}") + return label_idx, cost_info -# %% ../nbs/00_core.ipynb 4 -def classify(x, labels, llm_labeler, max_failures=5, default_label=0): - failures = 0 - while failures < max_failures: - try: - label = labels.index(llm_labeler(x)[0]) - time.sleep(1) - return label - except Exception as e: - failures += 1 - print(e) - time.sleep(1) - pass - if failures == max_failures: - return default_label - -# %% ../nbs/00_core.ipynb 5 -def label_dataset( - dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096 -): - """ - Filters a dataset using a labeler model. - - Args: - dataset (datasets.Dataset): Dataset to filter - text_column (str): Name of the column containing the text to classify - labeler_model (Any): Model to use for labeling - labels (List[str]): List of labels - sample (float): The fraction of the dataset to label and use for filtering - batch_size (int): Batch size for labeling - num_workers (int): Number of workers for labeling - max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors) - """ - - # Get a subset of the dataset - subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample))) - - # Label the subset - subset = subset.map( - lambda x: {"label": classify(x[text_column][:max_chars], labels, labeler_model)}, - batched=False, - num_proc=num_workers, - ) - - return subset - -# %% ../nbs/00_core.ipynb 7 -def train_labeler( - dataset, - text_column, - base_model_name, - n_labels, - training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -): - """ - Trains a labeler model on a labeled dataset. 
- - Args: - dataset (datasets.Dataset): Dataset to train on - text_column (str): Name of the text column - base_model_name (str): Name of the base model to use - n_labels (int): Number of labels - epochs (int): Number of epochs to train - batch_size (int): Batch size for training - num_workers (int): Number of workers for training - max_length (int): Maximum length of the input - """ - # Load the tokenizer - tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - # Load the model - model = AutoModelForSequenceClassification.from_pretrained( - base_model_name, num_labels=n_labels, max_length=max_length - ) - model.config.id2label = {i: i for i in range(n_labels)} - - # Preprocess the dataset - dataset = dataset.map( - lambda x: tokenizer( - x[text_column], padding="max_length", truncation=True, max_length=max_length - ), - batched=True, - num_proc=num_workers, - ) - - # Split the dataset - dataset = dataset.train_test_split(test_size=0.1, seed=42) - - # Get the data collator - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - def compute_metrics(eval_preds): - metric = evaluate.load("glue", "mrpc") - logits, labels = eval_preds - if isinstance(logits, tuple): # Some models return tuples - logits = logits[0] - print(logits.shape, labels) - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - # Get the trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset["train"], - eval_dataset=dataset["test"], - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - # Train the model - trainer.train() - - # Push the model to the hub - if push_to_hub: - trainer.push_to_hub() - - # Return the model - return model, tokenizer - -# %% ../nbs/00_core.ipynb 9 def filter_dataset( dataset, text_column, labeler_model, labels_to_keep, batch_size=32, num_workers=4 ): diff --git a/view_dataset.py b/view_dataset.py new file mode 100644 index 0000000..864889f --- /dev/null +++ b/view_dataset.py @@ -0,0 +1,10 @@ +import os +from pathlib import Path +from collections import Counter + +from datasets import load_dataset + +ds = load_dataset("roborovski/phi-1")["train"] +print(ds) +print(Counter(ds['label'])) +print(Counter(ds['language']))