diff --git a/taxonomy/.github/ISSUE_TEMPLATE/bug_report.md b/taxonomy/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..ea010d7d9 --- /dev/null +++ b/taxonomy/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,42 @@ +--- +name: Bug/Problem report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + + + +**Describe the bug/problem** + + + +- ... +- ... +- ... + +**Input given at the prompt** + + + +``` + ... +``` + +**Response that was received from the current model** + + + +``` + ... +``` + +**Response that you expected instead** + + +``` + ... +``` diff --git a/taxonomy/.github/ISSUE_TEMPLATE/proposal.md b/taxonomy/.github/ISSUE_TEMPLATE/proposal.md new file mode 100644 index 000000000..a8917f177 --- /dev/null +++ b/taxonomy/.github/ISSUE_TEMPLATE/proposal.md @@ -0,0 +1,40 @@ +--- +name: Proposal +about: Create a contribution proposal +title: '' +labels: '' +assignees: '' + +--- + +**Describe the proposed contribution to the taxonomy** + + + +- ... +- ... +- ... + +**Input given at the prompt** + + + +``` + ... +``` + +**Response from the current model** + + + +``` + ... +``` + +**Response that you would expect instead with the contribution** + + +``` + ... 
+``` diff --git a/taxonomy/.github/dependabot.yml b/taxonomy/.github/dependabot.yml new file mode 100644 index 000000000..05fef9964 --- /dev/null +++ b/taxonomy/.github/dependabot.yml @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 + +# GitHub Dependabot configuration file +version: 2 +updates: + + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + - package-ecosystem: "docker" + directory: "/.github/workflows" + schedule: + interval: "daily" + + # Maintain dependencies for Python scripts + - package-ecosystem: "pip" + directory: "/scripts" + schedule: + interval: "daily" diff --git a/taxonomy/.github/labeler.yml b/taxonomy/.github/labeler.yml new file mode 100644 index 000000000..5a1e64933 --- /dev/null +++ b/taxonomy/.github/labeler.yml @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: Apache-2.0 + +ci: + - changed-files: + - any-glob-to-any-file: + - scripts/** + - .github/workflows/** + - .github/*.yml + +documentation: + - changed-files: + - any-glob-to-any-file: + - "*.md" + - docs/** + +knowledge: + - changed-files: + - any-glob-to-any-file: + - knowledge/** + +skill: + - changed-files: + - any-glob-to-any-file: + - compositional_skills/** + +triage-needed: + - changed-files: + - any-glob-to-any-file: + - compositional_skills/** + - knowledge/** diff --git a/taxonomy/.github/pull_request_template.md b/taxonomy/.github/pull_request_template.md new file mode 100644 index 000000000..63e163b3f --- /dev/null +++ b/taxonomy/.github/pull_request_template.md @@ -0,0 +1,57 @@ +If your PR is related to a contribution to the taxonomy, please, fill +out the following questionnaire. If not, replace this whole text and the +following questionnaire with whatever information is applicable to your PR. + + +**Describe the contribution to the taxonomy** + + + +- ... +- ... +- ... + + +**Input given at the prompt** + + + +``` + ... 
+``` + + +**Response from the original model** + + + + +``` + ... +``` + + +**Response from the fine-tuned model** + + + + +``` + ... +``` + +**Contribution checklist** + + + +- [ ] The contribution was tested with `ilab generate` +- [ ] No errors or warnings were produced by `ilab generate` +- [ ] All [commits are signed off](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#legal) (DCO) +- [ ] The `qna.yaml` file contains at least 5 `seed_examples` +- [ ] The `qna.yaml` file was [linted](https://yamllint.com) and [prettified](https://onlineyamltools.com/prettify-yaml) ([yaml-validator](https://jsonformatter.org/yaml-validator) can do both) +- [ ] An `attribution.txt` file in the same folder as the `qna.yaml` file +- [ ] Content does not include PII or otherwise sensitive or confidential information +- [ ] Content does not include anything documented in the project's [Avoid these Topics](https://github.com/instructlab/taxonomy/blob/main/docs/SKILLS_GUIDE.md#avoid-these-topics) guidelines diff --git a/taxonomy/.github/workflows/actionlint.dockerfile b/taxonomy/.github/workflows/actionlint.dockerfile new file mode 100644 index 000000000..d4df32d2c --- /dev/null +++ b/taxonomy/.github/workflows/actionlint.dockerfile @@ -0,0 +1,3 @@ +# Since dependabot cannot update workflows using docker, +# we use this indirection since dependabot can update this file. 
+FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9 diff --git a/taxonomy/.github/workflows/actionlint.yml b/taxonomy/.github/workflows/actionlint.yml new file mode 100644 index 000000000..134e8a70b --- /dev/null +++ b/taxonomy/.github/workflows/actionlint.yml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' # This workflow + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' # This workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Download actionlint" + run: | + docker build --tag actionlint - < .github/workflows/actionlint.dockerfile + + - name: "Check workflow files" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color diff --git a/taxonomy/.github/workflows/docs.yml b/taxonomy/.github/workflows/docs.yml new file mode 100644 index 000000000..ae89a654b --- /dev/null +++ b/taxonomy/.github/workflows/docs.yml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Lint Markdown documents + +on: + push: + branches: + - "main" + paths: + - '**/*.md' + - '.markdownlint-cli2.yaml' + - '.github/workflows/docs.yml' # This workflow + pull_request: + branches: + - "main" + paths: + - '**/*.md' + - '.markdownlint-cli2.yaml' + - '.github/workflows/docs.yml' # 
This workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + markdown-lint: + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + - name: "Check Markdown documents" + uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0 + with: + globs: '**/*.md' diff --git a/taxonomy/.github/workflows/insight.yml b/taxonomy/.github/workflows/insight.yml new file mode 100644 index 000000000..5c0d410c2 --- /dev/null +++ b/taxonomy/.github/workflows/insight.yml @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: CoC insight analysis + +run-name: ${{ github.event.workflow_run.display_title || github.workflow }} + +on: + workflow_run: + workflows: + - "Lint compositional_skills and knowledge" + types: + - completed + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + insight-analysis: + runs-on: ubuntu-latest + if: ${{ (github.event.workflow_run.event == 'pull_request') && (github.repository == 'instructlab/taxonomy') && (github.event.workflow_run.conclusion == 'success') }} + name: Run insight analysis + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Debug info" + run: | + jq '.' 
"$GITHUB_EVENT_PATH" + + - name: "Download Pull Request number" + uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9 + with: + name: pull_request_number + run-id: ${{ github.event.workflow_run.id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: "Get Pull Request number" + run: | + echo "PULL_REQUEST_NUMBER=$(cat pull_request_number.txt)" >> "$GITHUB_ENV" + + - name: "Invoke insight analysis handler" + if: ${{ fromJSON(env.PULL_REQUEST_NUMBER) != 0 }} + run: | + curl -X "POST" "https://pr-analysis-handler.1fuhf5gskmng.us-east.codeengine.appdomain.cloud/analyze-pr?pr_number=${PULL_REQUEST_NUMBER}&owner=${REPOSITORY_OWNER}&repo=${REPOSITORY_NAME}" -H "accept: application/json" -H "Authorization: Bearer ${ANALYSIS_TOKEN}" + env: + REPOSITORY_OWNER: ${{ github.repository_owner }} + REPOSITORY_NAME: ${{ github.event.repository.name }} + ANALYSIS_TOKEN: ${{ secrets.COC_ANALYSIS_TOKEN }} diff --git a/taxonomy/.github/workflows/label.yml b/taxonomy/.github/workflows/label.yml new file mode 100644 index 000000000..b89826780 --- /dev/null +++ b/taxonomy/.github/workflows/label.yml @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: "Pull Request Labeler" + +on: + pull_request_target: + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + labeler: + permissions: + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Labeler Action" + uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + sync-labels: false + + add-comment: + if: ${{ (github.event.action == 'labeled') && (github.event.label.name == 'triage-uncertain') }} + permissions: + pull-requests: write + runs-on: ubuntu-latest + 
steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Add comment" + run: | + gh pr comment "${PULL_REQUEST_NUMBER}" --body "${PULL_REQUEST_COMMENT}" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + PULL_REQUEST_NUMBER: ${{ github.event.number }} + PULL_REQUEST_COMMENT: > + Hi! So you've been marked as `triage-uncertain`, don't fret! + This just means the triage team or member needs to discuss your PR during one of our [public meetings](https://github.com/instructlab/community/blob/main/Collaboration.md#triager-standup) before making a decision. (Yes! You are more than welcome to join us.) + You have either made something amazing, or maybe hit a corner case that we hadn't thought of, or something like that. + :star2: Thank you for your contribution! And you are pushing our :dog: :computer: project and we can't thank you enough. 
:sparkles: diff --git a/taxonomy/.github/workflows/lint.yml b/taxonomy/.github/workflows/lint.yml new file mode 100644 index 000000000..c3a5cf2f2 --- /dev/null +++ b/taxonomy/.github/workflows/lint.yml @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Lint compositional_skills and knowledge + +on: + workflow_dispatch: + push: + branches: + - main + paths: + - compositional_skills/**/qna.yaml + - knowledge/**/qna.yaml + - '.github/workflows/lint.yml' # This workflow + - 'scripts/**' # Scripts used by this workflow + + pull_request: + branches: + - main + paths: + - compositional_skills/**/*.yaml + - compositional_skills/**/*.yml + - knowledge/**/*.yaml + - knowledge/**/*.yml + - '.github/workflows/lint.yml' # This workflow + - 'scripts/**' # Scripts used by this workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + yamllint: + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Setup Python" + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + with: + python-version: "3.11" + + - name: "Install Python Packages" + run: | + pip install -r scripts/requirements.txt + + - name: "Find changed skills and knowledge files" + id: changed-files + uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8 # v45.0.7 + with: + files: | + compositional_skills/**/*.yaml + compositional_skills/**/*.yml + knowledge/**/*.yaml + knowledge/**/*.yml + + - name: "Check changed YAML file contents" + if: ${{ fromJSON(steps.changed-files.outputs.any_changed) }} + run: | + scripts/check-yaml.py ${{ steps.changed-files.outputs.all_changed_files }} + env: + 
TAXONOMY_FOLDERS: >- + compositional_skills + knowledge + + - name: "Check all YAML file contents" + if: ${{ !fromJSON(steps.changed-files.outputs.any_changed) }} + run: | + read -ra folders <<< "${TAXONOMY_FOLDERS}" + # shellcheck disable=SC2046 + scripts/check-yaml.py $(find "${folders[@]}" -name "qna.yaml" -print) + env: + SCHEMA_VERSION: 0 # use the schema version specified in the "version" key + TAXONOMY_FOLDERS: >- + compositional_skills + knowledge + + - name: "Save Pull Request number" + if: ${{ (github.event_name == 'pull_request') && (github.repository == 'instructlab/taxonomy') }} + run: | + echo "${PULL_REQUEST_NUMBER}" > pull_request_number.txt + env: + PULL_REQUEST_NUMBER: ${{ fromJSON(steps.changed-files.outputs.any_changed) && github.event.number || '0' }} + + - name: "Upload Pull Request number" + if: ${{ (github.event_name == 'pull_request') && (github.repository == 'instructlab/taxonomy') }} + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 + with: + name: pull_request_number + path: pull_request_number.txt + if-no-files-found: error + retention-days: 5 diff --git a/taxonomy/.github/workflows/matchers/actionlint.json b/taxonomy/.github/workflows/matchers/actionlint.json new file mode 100644 index 000000000..4613e1617 --- /dev/null +++ b/taxonomy/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/taxonomy/.github/workflows/stale_bot.yml b/taxonomy/.github/workflows/stale_bot.yml new file mode 100644 index 000000000..ddd9f2775 --- /dev/null +++ b/taxonomy/.github/workflows/stale_bot.yml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 
+ +name: 'Close stale issues and PRs' +on: + schedule: + - cron: '30 1 * * *' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Stale Action" + uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 + with: + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not had activity within 30 days. + It will be automatically closed if no further activity occurs within the next 31 days. + close-issue-message: > + This issue has been automatically closed due to inactivity. + days-before-issue-stale: 30 + days-before-issue-close: 31 + exempt-issue-labels: good-first-issue + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it has not had activity within 15 days. + It will be automatically closed if no further activity occurs within the next 31 days. + close-pr-message: > + This pull request has been automatically closed due to inactivity. 
+ days-before-pr-stale: 15 + days-before-pr-close: 31 + exempt-pr-labels: community-build-ready diff --git a/taxonomy/.github/workflows/update-CoC-report.yml b/taxonomy/.github/workflows/update-CoC-report.yml new file mode 100644 index 000000000..a445c8a01 --- /dev/null +++ b/taxonomy/.github/workflows/update-CoC-report.yml @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Update CoC report + +run-name: "${{ github.event.pull_request.title }} (#${{ github.event.pull_request.number}})" +on: + pull_request_target: + types: + - closed + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + update-coc-insight-handler: + runs-on: ubuntu-latest + if: ${{ github.repository == 'instructlab/taxonomy' }} + name: Update CoC insights handler for PR Close event + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Debug info" + run: | + jq '.' 
"$GITHUB_EVENT_PATH" + + - name: "Update CoC insight analysis handler" + run: | + curl -X "GET" "https://pr-analysis-handler.1fuhf5gskmng.us-east.codeengine.appdomain.cloud/pr-closed?pr_number=${PULL_REQUEST_NUMBER}&owner=${REPOSITORY_OWNER}&repo=${REPOSITORY_NAME}&merged=${IS_MERGED}" -H "accept: application/json" -H "Authorization: Bearer ${ANALYSIS_TOKEN}" + env: + REPOSITORY_OWNER: ${{ github.repository_owner }} + REPOSITORY_NAME: ${{ github.event.repository.name }} + PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number}} + IS_MERGED: ${{ github.event.pull_request.merged }} + ANALYSIS_TOKEN: ${{ secrets.COC_ANALYSIS_TOKEN }} diff --git a/taxonomy/.gitignore b/taxonomy/.gitignore new file mode 100644 index 000000000..5442ffd5a --- /dev/null +++ b/taxonomy/.gitignore @@ -0,0 +1,141 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +scratch.ipynb + +# IDEs +.vscode/ +.idea/ + +# Mac personalization files +.DS_Store + +# Ignore config.yaml from the cli +config.yaml diff --git a/taxonomy/.markdownlint-cli2.yaml b/taxonomy/.markdownlint-cli2.yaml new file mode 100644 index 000000000..7f06c29ca --- /dev/null +++ b/taxonomy/.markdownlint-cli2.yaml @@ -0,0 +1,16 @@ +config: + line-length: false + no-emphasis-as-header: false + first-line-heading: false + code-block-style: false + no-duplicate-header: false + single-trailing-newline: false + no-bare-urls: false +globs: + - "**/*.md" +ignores: + - 
".github/**" + - ".tox/**" + - "venv/**" + - ".venv/**" + - "knowledge/knowledge_domains.md" diff --git a/taxonomy/CODE_OF_CONDUCT.md b/taxonomy/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..aacad6834 --- /dev/null +++ b/taxonomy/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# InstructLab/taxonomy - Code of Conduct and Covenant + +This project adheres to the [InstructLab - Code of Conduct and Covenant](https://github.com/instructlab/community/blob/main/CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. diff --git a/taxonomy/CONTRIBUTING.md b/taxonomy/CONTRIBUTING.md new file mode 100644 index 000000000..5018dc377 --- /dev/null +++ b/taxonomy/CONTRIBUTING.md @@ -0,0 +1,169 @@ +# Contributing + +👍🎉 First off, thank you for taking the time to contribute! 🎉👍 + +The following is a set of guidelines for contributing. These are just guidelines, not rules. Use your best judgment, and feel free to propose changes to this document in a pull request. Please read the [Community Contribution Guide](https://github.com/instructlab/community/blob/main/CONTRIBUTING.md) first for general practices for the InstructLab community. + +## What Should I Know Before I Get Started? + +### Code of Conduct + +This project adheres to the [Contributor Covenant](https://github.com/instructlab/community/blob/main/CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. + +Please report unacceptable behavior to one of the [Maintainers](https://github.com/instructlab/community/blob/main/MAINTAINERS.md). + +### Related repositories + +In addition to this repository, InstructLab has two related repositories: + +- [CLI](https://github.com/instructlab/instructlab). This repository is responsible for the `ilab` command-line interface (CLI) tool. +- [Community](https://github.com/instructlab/community). This repository is responsible for showing collaboration details across the InstructLab community. 
+ +The following sections provide a general overview for contributing to the Taxonomy repository. + +## Ways of contributing to the taxonomy repository + +### Contributing skills and knowledge + +You can contribute to the taxonomy in the following two ways: + +1. Adding new examples to **existing leaf nodes**: + + - Go to the corresponding leaf node / end of the branch and modify the YAML + - Add a new example to the `qna.yaml` files as a new entry to the list + +2. Adding **new branches/skills** corresponding to the existing domain: + + - You can add new folders under the corresponding category (replace any spaces with underscores `_`) + - Create a new `qna.yaml` file containing examples for the new skill + +A detailed contribution guide is documented in the [How can I contribute](#how-can-i-contribute) section. + +### Contributing new features, enhancements or documentation + +Help on open source projects is always welcome and there is always something that can be improved. For example, documentation (like the text you are reading now) can always use improvement, code can always be clarified, variables or functions can always be renamed or commented on, and there is always a need for more test coverage. If you see something that you think should be fixed, take ownership! Here is how you get started: + +To propose a new feature, it's best to raise an issue in the appropriate repository. This way, features can be discussed with the project maintainers, ensuring that your time is not wasted working on a feature that the project developers will not accept into the codebase. + +*How Do I Submit A (Good) Enhancement or Improvement Suggestion?* Enhancement and improvement suggestions are tracked as [GitHub issues: Proposal](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&projects=&template=proposal.md&title=). Create an issue and provide the information suggested in the proposal template. + +## How do I start contributing? 
+ +The following workflow is designed to help you begin your first contribution journey. It will guide you through creating and picking up issues, working through them, having your work reviewed, and then merging. + +### How Can I Contribute? + +The goal of InstructLab is to enable true collaborative development around common Large Language Models (LLMs) using a technology that enables collaboration following standard open source development practices. A general overview of making a contribution to this project consists of writing an extension to the existing taxonomy, making a pull request, and getting your work reviewed and merged so that it can benefit the whole community. + +Before you start, review the [open issues](https://github.com/instructlab/taxonomy/issues) and [opened pull requests](https://github.com/instructlab/taxonomy/pulls) board to see if your contribution or enhancements are already proposed. You might instead be able to join forces with them by providing input to what they have started. If you are unsure about what kind of skill or knowledge to contribute, you can [open an issue](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&template=proposal.md&title=) first to discuss your proposal idea with the maintainers. + +To contribute to this repo, you'll use the *Fork and Pull* model common in many open source repositories. You can follow this process in a local terminal or in the GitHub web UI. + +- For details on the local process, check out the [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow) documentation from GitHub and [The GitHub Workflow Guide](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md) documentation from Kubernetes. +- For details on contributing using the GitHub webpage UI, see [Contributing using the GH UI](docs/contributing_via_GH_UI.md). 
+ +> [!IMPORTANT] +> For all contributions to InstructLab 🐶, you want to become familiar with the workflow described in the [InstructLab 🐶 CLI +> `ilab`](https://github.com/instructlab/instructlab) documentation. It would be best to understand how to test +> your changes by generating new data, training and downloading the new model, and testing it to check that it gives you the desired results. + +When your contribution is ready, you can create a pull request (PR). In general, we follow the standard [GitHub pull request](https://help.github.com/en/articles/about-pull-requests) process. Follow the template to provide details about your pull request to the maintainers. Before submitting pull requests, make sure your changes pass applicable formatting tests in the repository CI. + +>[!NOTE] +> Always refer to the [README.md](https://github.com/instructlab/taxonomy/blob/main/README.md) +> if you are unsure how to format your contributions. +> +### What can I contribute? + +You can contribute [Knowledge](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-knowledge-contributions) or [Skills](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-skill-contributions) to the taxonomy tree. Make sure to review the [general documentation](README.md) for a detailed explanation of these concepts and the differences between the two. + +> [!IMPORTANT] +> You can only contribute original material. **DO NOT** contribute copyrighted content or content coming from another system. + +### Submitting your contribution + +When submitting your PR, give it a title which is as explicit as possible. Include in the description of the PR on GitHub both what the system gave you before your contribution and what it gives you with your contribution. + +### Pull request review + +Once you've [created a pull request](#how-can-i-contribute), maintainers will review your proposed addition and may make suggestions to fix before merging. 
It will be easier for your pull request to receive reviews if you consider the criteria the reviewers follow while working. Remember to: + +- Run tests locally and ensure that they pass +- Ensure your contribution is in the proper format (`ilab generate` shouldn't report any warnings or errors) +- Break large changes into a logical series of smaller patches, which are easy to understand individually and combine to solve a broader issue +- Follow the project coding conventions +- Include the DCO sign off; see [Legal](#legal) + +The project maintainers use `LGTM` (Looks Good To Me) in comments on the code review to indicate acceptance. You can see more information on the triaging process in the [Triaging skills](https://github.com/instructlab/taxonomy/blob/main/docs/triaging/triaging-contributions.md) documentation. + +For a list of the maintainers and triagers, see the [MAINTAINERS.md](https://github.com/instructlab/community/blob/main/MAINTAINERS.md) page. + +## Submitting bugs + +To submit a new bug, raise an issue in the appropriate repository before creating a pull request. This ensures that the issue is properly tracked. To fix an existing bug, assign yourself a bug from the issues page of the desired repository. Then, submit a pull request for review. + +Bugs are tracked as [GitHub issues using the Bug Report template](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&template=bug_report.md&title=). Create an issue on that and provide the information suggested in the bug report issue template. + +## Legal + +We have tried to make it as easy as possible to make contributions. +This applies to how we handle the legal aspects of contribution. +We use the same approach - the [Developer's Certificate of Origin 1.1 (DCO)][DCO] - that [the Linux Kernel community uses][Linux-DCO] to manage code contributions. 
All contributions that leverage third-party content should either come from the public domain or be licensed with an open data license that does not restrict commercial use or the creation of derivative works, including the following license types: + +- CC0-1.0 +- CDLA-Permissive-2.0 +- CC-BY-4.0 +- CC-BY-SA-4.0 +- Apache-2.0 +- MIT + +Any third-party content contributed to this project undergoes modifications in order to formulate it in the templated format required for submission to this project. + +We simply ask that when submitting a patch for review, the developer must include a sign-off statement in the commit message. + +Here is an example `Signed-off-by` line, which indicates that the submitter accepts the DCO: + +```text +Signed-off-by: John Doe +``` + +You can include this automatically when you commit a change to your local Git repository using the following command: + +```shell +git commit -s +``` + +> [!TIP] +> If you created a commit message that did not include the `-s` option, you can edit your original commit message by using the `git commit -s --amend` command. Ensure you force push the amended commit to your pull request (PR). + +### License + +Unless specifically stated, this project is +distributed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). + +SPDX-License-Identifier: [Apache-2.0](https://spdx.org/licenses/Apache-2.0) + +For more details, see the [LICENSE](LICENSE). + +### For your attribution.txt file + +An important part of contributing to the InstructLab project is citing your sources of information. This comes in the form of your `attribution.txt` that you add to the pull requests. Almost all instances of attribution can be covered by the parameters required for Creative Commons Attribution licenses. 
Some parameters are as follows: + +- Title of work +- Link to work +- Include link to a specific revision where possible +- License of the work +- Include an SPDX identifier where possible +- Creator names +- Copyright information +- Modification information +- Indicate if work was itself derived from another openly licensed work + +You can also see this citation style in the [Data sources documentation](https://github.com/instructlab/community/blob/main/docs/DataSources.md) + +## Development + +Please consult the [`ilab` documentation](https://github.com/instructlab/instructlab) to set up your environment. + +[DCO]: https://developercertificate.org/ +[Linux-DCO]: https://docs.kernel.org/process/submitting-patches.html#sign-your-work-the-developer-s-certificate-of-origin diff --git a/taxonomy/CONTRIBUTOR_ROLES.md b/taxonomy/CONTRIBUTOR_ROLES.md new file mode 100644 index 000000000..5cafbef46 --- /dev/null +++ b/taxonomy/CONTRIBUTOR_ROLES.md @@ -0,0 +1,3 @@ +# Contributor roles for InstructLab + +For information about contributor roles, see [Contributor roles for InstructLab](https://github.com/instructlab/community/blob/main/CONTRIBUTOR_ROLES.md). \ No newline at end of file diff --git a/taxonomy/LICENSE b/taxonomy/LICENSE new file mode 100644 index 000000000..6b0b1270f --- /dev/null +++ b/taxonomy/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/taxonomy/MAINTAINERS.md b/taxonomy/MAINTAINERS.md new file mode 100644 index 000000000..bb624bcd2 --- /dev/null +++ b/taxonomy/MAINTAINERS.md @@ -0,0 +1 @@ +For a complete list of InstructLab project maintainers, see [Maintainers](https://github.com/instructlab/community/blob/main/MAINTAINERS.md). \ No newline at end of file diff --git a/taxonomy/Makefile b/taxonomy/Makefile new file mode 100644 index 000000000..c33811f62 --- /dev/null +++ b/taxonomy/Makefile @@ -0,0 +1,22 @@ +.PHONY: help +help: + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +# +# If you want to see the full commands, run: +# NOISY_BUILD=y make +# +ifeq ($(NOISY_BUILD),) + ECHO_PREFIX=@ + CMD_PREFIX=@ + PIPE_DEV_NULL=> /dev/null 2> /dev/null +else + ECHO_PREFIX=@\# + CMD_PREFIX= + PIPE_DEV_NULL= +endif + +.PHONY: md-lint +md-lint: ## Lint markdown files + $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" + $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null diff --git a/taxonomy/README.md b/taxonomy/README.md new file mode 100644 index 000000000..74e4ce178 --- /dev/null +++ b/taxonomy/README.md @@ -0,0 +1,644 @@ +# InstructLab 🐢 Taxonomy + +## 
Contents 📖 + +- [Welcome to the InstructLab Taxonomy](#welcome-to-the-instructlab-taxonomy) +- [Learning](#learning) +- [Getting Started with Skill Contributions](#getting-started-with-skill-contributions) + - [Skills: YAML examples](#skills-yaml-examples) +- [Getting Started with Knowledge Contributions](#getting-started-with-knowledge-contributions) + - [Knowledge: YAML examples](#knowledge-yaml-examples) +- [Taxonomy tree layout](#taxonomy-tree-layout) +- [Contribute knowledge and skills to the taxonomy!](#contribute-knowledge-and-skills-to-the-taxonomy) + - [Ways to contribute](#ways-to-contribute) + - [How to contribute skills and knowledge](#how-to-contribute-skills-and-knowledge) + +## Welcome to the InstructLab Taxonomy + +InstructLab 🐶 uses a novel synthetic data-based alignment tuning method for +Large Language Models (LLMs). The "**lab**" in Instruct**Lab** 🐶 stands for +[**L**arge-Scale **A**lignment for Chat**B**ots](https://arxiv.org/abs/2403.01081) [1]. + +The LAB method is driven by taxonomies, which are largely created manually and +with care. + +This repository contains a taxonomy tree that allows you to create models +tuned with your data (enhanced via synthetic data generation) using the LAB 🐶 +method. + +[1] Shivchander Sudalairaj*, Abhishek Bhandwaldar*, Aldo Pareja*, Kai Xu, David D. Cox, Akash Srivastava*. "LAB: Large-Scale Alignment for ChatBots", arXiv preprint arXiv: 2403.01081, 2024. (* denotes equal contributions) + +## Choosing domains for the taxonomy + +In general, we use the Dewey Decimal Classification (DDC) System to determine our domains (and subdomains) in the taxonomy. This [DDC SUMMARIES document](https://www.oclc.org/content/dam/oclc/dewey/resources/summaries/deweysummaries.pdf) is a great resource for determining where a topic might be classified. 
+ +If you are unsure where to put your knowledge or compositional skill, create a folder in the `miscellaneous_unknown` folder under the `knowledge` or `compositional_skills` folders. + +## Learning + +Learn about the concepts of "skills" and "knowledge" in our [InstructLab Community Learning Guide](https://github.com/instructlab/community/blob/main/docs/README.md). + +## Getting Started with Skill Contributions + +Skills require a much smaller volume of content than knowledge contributions. An entire skill contribution to the taxonomy tree can be just a few lines of YAML in the `qna.yaml` file ("qna" is short for "questions and answers") and an `attribution.txt` file for citing sources. + +Your skills contribution pull requests must include the following: + +- A `qna.yaml` that contains a set of key/value entries with the following keys + - Each `qna.yaml` file requires a minimum of five question and answer pairs. +- An `attribution.txt` that includes the sources for the information used in the `qna.yaml` + +> [!TIP] +> The skill taxonomy structure is used in several ways: +> +> 1. To select the right subset of the taxonomy to use for data generation. +> 2. To determine the interpretability by human contributors and maintainers. +> 3. As part of the prompt to the LLM used to generate synthetic samples. + +> [!IMPORTANT] +> There is a limit to how much content can exist in the question/answer pairs for the model to process. Due to this, only add a maximum +> of around 2300 words to your question and answer seed example pairs in the `qna.yaml` file. + +Compositional skills can either be grounded (includes a context) or ungrounded (does not include a context). Grounded or ungrounded is declared in the taxonomy tree, for example: `linguistics/writing/poetry/haiku/` (ungrounded) or `grounded/linguistics/grammar` (grounded). The `qna.yaml` is in the final node. + +Taxonomy skill files must be a valid [YAML](https://yaml.org/) file named `qna.yaml`. 
Each `qna.yaml` file contains a set of key/value entries with the following keys: + +- `version`: The value must be the number 2. **Required** +- `task_description`: A description of the skill. **Required** +- `created_by`: The GitHub username of the contributor. **Required** +- `seed_examples`: A collection of key/value entries. New + submissions should have at least five entries, although + older files may have fewer. **Required** + - `context`: Grounded skills require the user to provide context containing information that the model is expected to take into account during processing. This is different from knowledge, where the model is expected to gain facts and background knowledge from the tuning process. The context key should not be used for ungrounded skills. + - `question`: A question for the model. **Required** + - `answer`: The desired response from the model. **Required** + +Other keys at any level are currently ignored. + +### Skills: YAML examples + +To make the `qna.yaml` files easier and faster for humans to read, it is recommended to specify `version` first, followed by `task_description`, then `created_by`, and finally `seed_examples`. +In `seed_examples`, it is recommended to specify `context` first (if applicable), followed by `question` and `answer`. + +*Example `qna.yaml`* + +```yaml +version: 2 +task_description: +created_by: +seed_examples: + - question: + answer: | + + - context: | + + question: + answer: | + + ... +``` + +Then, you create an `attribution.txt` file that includes the sources of your information. These can also be self authored sources. + +*Example `attribution.txt`* + +```text +[Link to source] +[Link to work] +[License of the work] +[Creator name] +``` + +For more information on what to include in your `attribution.txt` file, see [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) in CONTRIBUTING.md. 
+ +If you have not written YAML before, don't be intimidated - it's just text. + +> [!TIP] +> +> - Spaces and indentation matter in YAML. Two spaces to indent. +> - Don't use tabs! +> - Be careful to not have trailing spaces at the end of a line. +> - Each example in `seed_examples` begins with a "-". Place this "-" in + front of the first field (`question` or `context`). The remaining keys in the + example should not have this "-". +> - Some special characters such as " and ' need to be escaped with a backslash. This is why some + of the lines for keys in the example YAML start the value with the '|' character followed by a new line and then an indented multi-line string. + This character disables all of the special characters in the value for the key. + You might also want to use the '|' character for multi-line strings. +> - Consider quoting all values with " to avoid surprising YAML parser behavior + (e.g. an answer of Yes can be interpreted by the parser as the boolean value + `True`, unless "Yes" is quoted.) +> - See https://yaml-multiline.info/ for more info. + +It is recommended that you **lint**, or verify, your YAML using a tool. One linter option is [yamllint.com](https://yamllint.com). You can copy/paste your YAML into the box and click **Go** to have it analyze your YAML and make recommendations. Online tools like [prettified](https://onlineyamltools.com/prettify-yaml) and [yaml-validator](https://jsonformatter.org/yaml-validator) can automatically reformat your YAML to adhere to our `yamllint` PR checks, such as breaking lines longer than 120 characters. + +#### Ungrounded compositional skill: YAML example + +```yaml +version: 2 +task_description: 'Teach the model how to rhyme.' +created_by: juliadenham +seed_examples: + - question: What are 5 words that rhyme with horn? + answer: warn, torn, born, thorn, and corn. + - question: What are 5 words that rhyme with cat? + answer: bat, gnat, rat, vat, and mat. + - question: What are 5 words that rhyme with poor? 
+ answer: door, shore, core, bore, and tore. + - question: What are 5 words that rhyme with bank? + answer: tank, rank, prank, sank, and drank. + - question: What are 5 words that rhyme with bake? + answer: wake, lake, steak, make, and quake. +``` + +Seriously, that's it. + +Here is the location of this YAML in the taxonomy tree. Note that the YAML file +itself, plus any added directories that contain the file, is the entirety of the skill +in terms of a taxonomy contribution: + +#### Ungrounded compositional skill: Directory tree example + +```ascii +[...] + +└── writing + └── poetry + | └── haiku <=== here it is :) + | | └── qna.yaml + | | attribution.txt + [...] + └── prose + | └── debate + | | └── qna.yaml + | | attribution.txt + [...] + +[...] +``` + +#### Grounded compositional skill: YAML example + +Remember that [grounded compositional skills](docs/SKILLS_GUIDE.md) require additional context and include a `context` field. + +This example snippet assumes the GitHub username `mairin` and shows some of the question/answer pairs present in the actual file: + +```yaml +version: 2 +task_description: | + This skill provides the ability to read a markdown-formatted table. +created_by: mairin # Use your GitHub username; only one creator supported +seed_examples: + - context: | + | **Breed** | **Size** | **Barking** | **Energy** | + |----------------|--------------|-------------|------------| + | Afghan Hound | 25-27 in | 3/5 | 4/5 | + | Labrador | 22.5-24.5 in | 3/5 | 5/5 | + | Cocker Spaniel | 14.5-15.5 in | 3/5 | 4/5 | + | Poodle (Toy) | <= 10 in | 4/5 | 4/5 | + question: | + Which breed has the most energy? + answer: | + The breed with the most energy is the Labrador. 
+ - context: | + | **Name** | **Date** | **Color** | **Letter** | **Number** | + |----------|----------|-----------|------------|------------| + | George | Mar 5 | Green | A | 1 | + | Gráinne | Dec 31 | Red | B | 2 | + | Abigail | Jan 17 | Yellow | C | 3 | + | Bhavna | Apr 29 | Purple | D | 4 | + | Rémy | Sep 9 | Blue | E | 5 | + question: | + What is Gráinne's letter and what is her color? + answer: | + Gráinne's letter is B and her color is red. + - context: | + | Banana | Apple | Blueberry | Strawberry | + |--------|------------|-----------|------------| + | Yellow | Red, Green | Blue | Red | + | Large | Medium | Small | Small | + | Peel | Peel | No peel | No peel | + question: | + Which fruit is blue, small, and has no peel? + answer: | + The blueberry is blue, small, and has no peel. +``` + +#### Grounded compositional skill: Directory tree example + +```ascii +[...] + +grounded +└── technology + └── machine_learning + └── natural_language_processing + | | └── information_extraction + | └── inference + | | └── qualitative + | | ├── sentiment + | | | └── qna.yaml + | | | attribution.txt + │ ├── quantitative + │ │ ├── table_analysis <=== here it is :) + │ | | | └── qna.yaml + │ │ │ | attribution.txt + +[...] +``` + +## Getting Started with Knowledge Contributions + +While skills are foundational or performative, knowledge is based more on answering questions that involve facts, +data, or references. + +Knowledge is supported by documents, such as a textbook, technical manual, encyclopedia, journal, or magazine. + +Knowledge in the taxonomy tree consists of a few more elements than skills: + +> [!IMPORTANT] +> If you are using InstructLab version `0.21.0` or above, you can specify PDF files in your knowledge `qna.yaml` file as a valid document type. Any previous version of InstructLab still only consumes knowledge documents in markdown format. 
+ +- Each knowledge node in the tree has a `qna.yaml`, similar to the format of the `qna.yaml` for skills. +- ⭐ Knowledge submissions require you to create a Git repository (for example, on GitHub) that contains the files of your knowledge contributions. +- The `qna.yaml` includes parameters that contain information from your repository. + +> [!TIP] +> Guidelines for Knowledge contributions +> +> - Submit the most up-to-date version of the document +> - All submissions must be text; images will be ignored +> - Do not use tables in your markdown freeform contribution + +The `qna.yaml` format must include the following fields: + +- `version`: The version of the `qna.yaml` file format used for synthetic data generation (SDG). The value must be the number 3. +- `created_by`: Your GitHub username. +- `domain`: Specify the category of the knowledge. +- `seed_examples`: A collection of key/value entries. + - `context`: A chunk of information from the knowledge document. Each `qna.yaml` needs five `context` blocks and has a maximum word count of 500 words. + - `questions_and_answers`: The parameter that holds your questions and answers. + - `question`: Specify a question for the model. Each `qna.yaml` file needs at least three question and answer pairs per `context` chunk with a maximum word count of 250 words. + - `answer`: Specify the desired answer from the model. Each `qna.yaml` file needs at least three question and answer pairs per `context` chunk with a maximum word count of 250 words. +- `document_outline`: Describe an overview of the document you're submitting. +- `document`: The source of your knowledge contribution. + - `repo`: The URL to your repository that holds your knowledge files. + - `commit`: The SHA of the commit in your repository with your knowledge files. + - `patterns`: A list of glob patterns specifying the files in your repository. Any glob pattern that starts with `*`, such as `*.md`, must be quoted due to YAML rules. For example, `"*.md"`. 
+ +### Knowledge: YAML examples + +```yaml +version: 3 +domain: astronomy +created_by: juliadenham +seed_examples: + - context: | + **Phoenix** is a minor [constellation](constellation "wikilink") in the + [southern sky](southern_sky "wikilink"). Named after the mythical + [phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a + celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 + *[Uranometria](Uranometria "wikilink")*. The French explorer and + astronomer [Nicolas Louis de + Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter + stars and gave their [Bayer designations](Bayer_designation "wikilink") + in 1756. The constellation stretches from roughly −39 degrees to −57 degrees + [declination](declination "wikilink"), and from 23.5h to 2.5h of [right + ascension](right_ascension "wikilink"). The constellations Phoenix, + [Grus](Grus_(constellation) "wikilink"), + [Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), + are known as the Southern Birds. + questions_and_answers: + - question: | + What is the Phoenix constellation? + answer: | + Phoenix is a minor constellation in the southern sky. + - question: | + Who charted the Phoenix constellation? + answer: | + The Phoenix constellation was charted by french explorer and + astronomer Nicolas Louis de Lacaille. + - question: | + How far does the Phoenix constellation stretch? + answer: | + The phoenix constellation stretches from roughly −39° to −57° + declination, and from 23.5h to 2.5h of right ascension. + - context: | + Phoenix was the largest of the 12 constellations established by [Petrus + Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter + Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de + Houtman](Frederick_de_Houtman "wikilink"). 
It first appeared on a 35cm + diameter celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first + depiction of this constellation in a celestial atlas was in [Johann + Bayer](Johann_Bayer "wikilink")'s + *[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included + it in his southern star catalog the same year under the Dutch name *Den + voghel Fenicx*, "The Bird Phoenix", symbolising the + [phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One + name of the brightest star [Alpha + Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic: + العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and + was coined sometime after 1800 in relation to the constellation. + questions_and_answers: + - question: | + What is the brightest star in the Phoenix constellation + called? + answer: | + Alpha Phoenicis or Ankaa is the brightest star in the Phoenix + Constellation. + - question: Where did the Phoenix constellation first appear? + answer: | + The Phoenix constellation first appeared on a 35-cm diameter + celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with Jodocus Hondius. + - question: | + What does "The Bird Phoenix" symbolize? + answer: | + "The Bird Phoenix" symbolizes the phoenix of classical mythology. + - context: | + Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink") + and Sculptor to the north, Grus to the west, Tucana to the south, + touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and + [Eridanus](Eridanus_(constellation) "wikilink") to the east and + southeast. The bright star [Achernar](Achernar "wikilink") is + nearby. The three-letter abbreviation for the constellation, as + adopted by the [International Astronomical + Union](International_Astronomical_Union "wikilink") in 1922, is + "Phe". 
The official constellation boundaries, as set by Belgian + astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930, + are defined by a polygon of 10 segments. In the [equatorial coordinate + system](equatorial_coordinate_system "wikilink"), the [right + ascension](right_ascension "wikilink") coordinates of these borders lie + between 23h 26.5m and 02h 25.0m, + while the [declination](declination "wikilink") + coordinates are between −39.31° and −57.84°. This means it remains + below the horizon to anyone living north of the [40th + parallel](40th_parallel_north "wikilink") in the [Northern + Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky + for anyone living north of the [equator](equator "wikilink"). It is most + visible from locations such as Australia and South Africa during late + [Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most + of the constellation lies within, and can be located by, forming a + triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink") + and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre + of this. + questions_and_answers: + - question: What are the characteristics of the Phoenix constellation? + answer: | + Phoenix is a small constellation bordered by Fornax and Sculptor to + the north, Grus to the west, Tucana to the south, touching on the + corner of Hydrus to the south, and Eridanus to the east and southeast. + The bright star Achernar is nearby. + - question: | + When is the phoenix constellation most visible? + answer: | + Phoenix is most visible from locations such as Australia and + South Africa during late Southern Hemisphere spring. + - question: | + What are the Phoenix Constellation boundaries? + answer: | + The official constellation boundaries for Phoenix, as set by Belgian + astronomer Eugène Delporte in 1930, are defined by a polygon of 10 + segments. 
+ - context: | + Ten stars have been found to have planets to date, and four planetary + systems have been discovered with the [SuperWASP](SuperWASP "wikilink") + project. [HD 142](HD_142 "wikilink") is a yellow giant that has an + apparent magnitude of 5.7, and has a planet ([HD 142b](HD_142_b + "wikilink")) 1.36 times the mass of Jupiter which orbits every 328 days. + [HD 2039](HD_2039 "wikilink") is a yellow subgiant with an apparent + magnitude of 9.0 around 330 light years away which has a planet ([HD 2039 + b](HD_2039_b "wikilink")) six times the mass of Jupiter. [WASP-18](WASP-18 + "wikilink") is a star of magnitude 9.29 which was discovered to have a hot + Jupiter-like planet ([WASP-18b](WASP-18b "wikilink")) taking less than a + day to orbit the star. The planet is suspected to be causing WASP-18 to + appear older than it really is. [WASP-4](WASP-4 "wikilink") and + [WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000 + light years distant and of 13th magnitude, each with a single planet + larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange + dwarf of spectral type K4V and visual magnitude 11.3, which has a + planetary companion of similar size and mass to Saturn. The planet + completes an orbit every 3.9 days. + questions_and_answers: + - question: In the Phoenix constellation, how many stars have planets? + answer: | + In the Phoenix constellation, ten stars have been found to have + planets to date, and four planetary systems have been discovered + with the SuperWASP project. + - question: | + What is HD 142? + answer: | + HD 142 is a yellow giant that has an apparent magnitude of 5.7, and + has a planet (HD 142 b) 1.36 times the mass of Jupiter which + orbits every 328 days. + - question: | + Are WASP-4 and WASP-5 solar-type yellow stars? + answer: | + Yes, WASP-4 and WASP-5 are solar-type yellow stars around 1000 light + years distant and of 13th magnitude, each with a single planet + larger than Jupiter. 
+ - context: | + The constellation does not lie on the + [galactic plane](galactic_plane "wikilink") of the Milky Way, and there + are no prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf + [irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude 11.0 + and lying some 12.7 million light years distant. Only 24000 light years in + diameter, it is an outlying member of the [Sculptor Group](Sculptor_Group + "wikilink"). NGC 625 is thought to have been involved in a collision and + is experiencing a burst of [active star formation](Active_galactic_nucleus + "wikilink"). [NGC 37](NGC_37 "wikilink") is a + [lenticular galaxy](lenticular_galaxy "wikilink") of apparent magnitude + 14.66. It is approximately 42 [kiloparsecs](kiloparsecs "wikilink") + (137,000 [light-years](light-years "wikilink")) in diameter and about + 12.9 billion years old. [Robert's Quartet](Robert's_Quartet "wikilink") + (composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three + spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink") + and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located + around 160 million light-years away which are in the process of colliding + and merging. They are within a circle of radius of 1.6 arcmin, + corresponding to about 75,000 light-years. Located in the galaxy ESO + 243-49 is [HLX-1](HLX-1 "wikilink"), an + [intermediate-mass black hole](intermediate-mass_black_hole + "wikilink")—the first one of its kind identified. It is thought to be a + remnant of a dwarf galaxy that was absorbed in a + [collision](Interacting_galaxy "wikilink") with ESO 243-49. Before its + discovery, this class of black hole was only hypothesized. + questions_and_answers: + - question: | + Is the Phoenix Constellation part of the Milky Way? + answer: | + The Phoenix constellation does not lie on the galactic plane of + the Milky Way, and there are no prominent star clusters.
+ - question: | + How many light years away is NGC 625? + answer: | + NGC 625 lies some 12.7 million light years distant and is an + outlying member of the Sculptor Group. + - question: | + What is Robert's Quartet composed of? + answer: | + Robert's Quartet is composed of the irregular galaxy NGC 87, + and three spiral galaxies NGC 88, NGC 89 and NGC 92. +document_outline: | + Information about the Phoenix Constellation including the + history, characteristics, and features of the stars in the constellation. +document: + repo: https://github.com/juliadenham/Summit_knowledge + commit: 0a1f2672b9b90582e6115333e3ed62fd628f1c0f + patterns: + - phoenix_constellation.md + +``` + +*Example `attribution.txt` file* + +```text +Title of work: Phoenix (constellation) +Link to work: https://en.wikipedia.org/wiki/Phoenix_(constellation) +Revision: https://en.wikipedia.org/w/index.php?title=Phoenix_(constellation)&oldid=1237187773 +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors +``` + +This knowledge example references one markdown file: `phoenix_constellation.md`. You can also add multiple files for knowledge contributions. + +> [!NOTE] +> Due to the higher volume, **it will naturally take longer to receive acceptance for +> a knowledge contribution pull request than for a skill pull request**. Smaller +> pull requests are simpler and require less time and effort to review. + +What might these markdown files look like? They can be freeform. Here's what a +snippet of `phoenix_constellation.md` might look like in your Git repository. + +#### Knowledge: Markdown file example + +```markdown +# Phoenix (constellation) + +**Phoenix** is a minor [constellation](constellation "wikilink") in the +[southern sky](southern_sky "wikilink"). Named after the mythical +[phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a +celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 +*[Uranometria](Uranometria "wikilink")*.
The French explorer and +astronomer [Nicolas Louis de +Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter +stars and gave their [Bayer designations](Bayer_designation "wikilink") +in 1756. The constellation stretches from roughly −39 degrees to −57 degrees +[declination](declination "wikilink"), and from 23.5h to 2.5h of [right +ascension](right_ascension "wikilink"). The constellations Phoenix, +[Grus](Grus_(constellation) "wikilink"), +[Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), +are known as the Southern Birds. + +The brightest star, [Alpha Phoenicis](Alpha_Phoenicis "wikilink"), is +named Ankaa, an [Arabic](Arabic "wikilink") word meaning 'the Phoenix'. +It is an orange giant of apparent magnitude 2.4. Next is [Beta +Phoenicis](Beta_Phoenicis "wikilink"), actually a +[binary](Binary_star "wikilink") system composed of two yellow giants +with a combined apparent magnitude of 3.3. [Nu +Phoenicis](Nu_Phoenicis "wikilink") has a dust disk, while the +constellation has ten star systems with known planets and the recently +discovered [galaxy clusters](galaxy_cluster "wikilink") [El +Gordo](El_Gordo_(galaxy_cluster) "wikilink") and the [Phoenix +Cluster](Phoenix_Cluster "wikilink")—located 7.2 and 5.7 billion light +years away respectively, two of the largest objects in the [visible +universe](visible_universe "wikilink"). Phoenix is the +[radiant](radiant_(meteor_shower) "wikilink") of two annual [meteor +showers](meteor_shower "wikilink"): the +[Phoenicids](Phoenicids "wikilink") in December, and the July +Phoenicids. +``` + +In the taxonomy repository, here's what the previously referenced knowledge might look like in the tree: + +#### Knowledge: directory tree example + +```ascii +[...] + +└── knowledge + └── science + ├── astronomy + │ └── constellations + │ └── Phoenix <=== here it is :) + │ | └── qna.yaml + | | attribution.txt + │ └── Orion + │ └── qna.yaml + | attribution.txt +[...]
+``` + +For more information on what to include in your `attribution.txt` file, see [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) in CONTRIBUTING.md. + +You can organize the knowledge markdown files in your repository however you want. You just need to ensure the YAML is pointing to the correct file. + +## Taxonomy tree Layout + +The taxonomy tree is organized in a cascading directory structure. At the end of +each branch, there is a YAML file (qna.yaml) that contains the examples for that +domain. Maintainers can decide to change the names of the existing branches or to add new branches. + +> [!IMPORTANT] +> Folder names do not have spaces. Use underscores between words. + +Below is an illustrative directory structure to show this layout: + +```ascii +. +└── linguistics + ├── writing + │ ├── brainstorming + │ │ ├── idea_generation + | │ └── qna.yaml + │ │ attribution.txt + │ │ ├── refute_claim + | │ └── qna.yaml + │ │ attribution.txt + │ ├── prose + │ │ ├── articles + │ │ └── qna.yaml + │ │ attribution.txt + └── grammar + └── qna.yaml + │ attribution.txt + └── spelling + └── qna.yaml + attribution.txt +``` + +For an extensive example of this layout, see [taxonomy_tree_layout](docs/taxonomy_diagram.md) in the documentation folder. + +## Contribute knowledge and skills to the taxonomy + +The ability to contribute to a Large Language Model (LLM) has been difficult in no small part because it is difficult to get access to the necessary compute infrastructure. + +This taxonomy repository will be used as the seed to synthesize the training data for InstructLab-trained models. We intend to retrain the model(s) using the main branch following InstructLab's progressive training on a regular basis. This enables fast iteration of the model(s), for the benefit of the open source community.
+ +By contributing your skills and knowledge to this repository, you will see your changes built into an LLM within days of your contribution rather than months or years! If you are working with a model and notice its knowledge or ability lacking, you can correct it by contributing knowledge or skills and check if it's improved after your changes are built. + +While public contributions are welcome to help drive community progress, you can also fork this repository under [the Apache License, Version 2.0](LICENSE), add your own internal skills, and train your own models internally. However, you might need your own access to significant compute infrastructure to perform sufficient retraining. + +## Ways to Contribute + +You can contribute to the taxonomy in the following two ways: + +1. Adding new examples to **existing leaf nodes**: +2. Adding **new branches/skills** corresponding to the existing domain: + +For more information, see the [Ways of contributing to the taxonomy repository](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#ways-of-contributing-to-the-taxonomy-repository) documentation. + +## How to contribute skills and knowledge + +To contribute to this repo, you'll use the *Fork and Pull* model common in many open source repositories. You can add your skills and knowledge to the taxonomy in multiple ways; for additional information on how to make a contribution, see the [Documentation on contributing](CONTRIBUTING.md). You can also use the following guides to help with contributing: + +- Contributing using the [GitHub webpage UI](docs/contributing_via_GH_UI.md). +- Contributing knowledge to the taxonomy in the [Knowledge contribution guidelines](docs/knowledge-contribution-guide.md). + +### Why should I contribute? + +This taxonomy repository will be used as the seed to synthesize the training +data for InstructLab-trained models. We intend to retrain the model(s) using the main +branch as often as possible (at least weekly). 
+Fast iteration of the model(s) benefits the open source community and enables model developers who do not have access to the necessary compute infrastructure. diff --git a/taxonomy/SECURITY.md b/taxonomy/SECURITY.md new file mode 100644 index 000000000..b37d8f1cb --- /dev/null +++ b/taxonomy/SECURITY.md @@ -0,0 +1 @@ +You can find information on how to report a potential security vulnerability, as well as where to subscribe to receive security alerts, on the project's [Security Page](https://github.com/instructlab/.github/blob/main/SECURITY.md). diff --git a/taxonomy/compositional_skills/arts/.gitignore b/taxonomy/compositional_skills/arts/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/engineering/.gitignore b/taxonomy/compositional_skills/engineering/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/geography/.gitignore b/taxonomy/compositional_skills/geography/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/arts/.gitignore b/taxonomy/compositional_skills/grounded/arts/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/engineering/.gitignore b/taxonomy/compositional_skills/grounded/engineering/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/geography/.gitignore b/taxonomy/compositional_skills/grounded/geography/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/history/.gitignore b/taxonomy/compositional_skills/grounded/history/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/linguistics/.gitignore b/taxonomy/compositional_skills/grounded/linguistics/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git 
a/taxonomy/compositional_skills/grounded/linguistics/inclusion/attribution.txt b/taxonomy/compositional_skills/grounded/linguistics/inclusion/attribution.txt new file mode 100644 index 000000000..39d183669 --- /dev/null +++ b/taxonomy/compositional_skills/grounded/linguistics/inclusion/attribution.txt @@ -0,0 +1,15 @@ +Title of work: Master-slave(technology) +Link to work: https://en.wikipedia.org/wiki/Master%E2%80%93slave_(technology) +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors + +Title of work: Whitelist +Link to work: https://en.wikipedia.org/wiki/Whitelist +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors + + +Title of work: Blacklist (computing) +Link to work: https://en.wikipedia.org/wiki/Blacklist_(computing) +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors \ No newline at end of file diff --git a/taxonomy/compositional_skills/grounded/linguistics/inclusion/qna.yaml b/taxonomy/compositional_skills/grounded/linguistics/inclusion/qna.yaml new file mode 100644 index 000000000..6479ce09f --- /dev/null +++ b/taxonomy/compositional_skills/grounded/linguistics/inclusion/qna.yaml @@ -0,0 +1,54 @@ +--- +version: 3 +task_description: This skill helps the writer use more inclusive IT terminology. +created_by: imstilllearning +seed_examples: + - context: > + In database replication, the master database is regarded as the + authoritative source, and the slave databases are synchronized to it. + question: How would you rewrite this sentence to use more inclusive IT terminology? + answer: > + In database replication, the primary database is regarded as the + authoritative source, and the secondary databases are synchronized to it. + - context: > + Blacklisting is the action of a group or authority compiling a + blacklist of people, countries, or other entities to be avoided or + distrusted as being deemed unacceptable to those making the list.
+ question: Show me how to use inclusive IT terminology to replace non-inclusive + IT terminology. + answer: > + Denylisting is the action of a group or authority compiling a denylist + of people, countries, or other entities to be avoided or distrusted as + being deemed unacceptable to those making the list. + - context: > + In database replication, the master database is regarded as the + authoritative source, and the slave databases are synchronized to it. + question: How would you revise this sentence to use Inclusive IT terminology? + answer: > + In database replication, the parent database is regarded as the + authoritative source, and the child databases are synchronized to it. + - context: > + A use for whitelists is in local area network (LAN) security. Many + network admins set up MAC address whitelists, or a MAC address filter, to + control who is allowed on their networks. + question: How would you revise this sentence to use inclusive IT terminology? + answer: > + A use for allowlists is in local area network (LAN) security. Many + network admins set up MAC address allowlists, or a MAC address filter, to + control who is allowed on their networks. + - context: > + A master clock that provides time signals used to synchronize one or + more slave clocks as a part of a clock network. + question: How would you improve this sentence to use inclusive IT language? + answer: > + A primary clock that provides time signals used to synchronize one or + more secondary clocks as a part of a clock network. + - context: > + As a verb, blacklist can mean to put an individual or entity on such a + list. A blacklist is synonymous with a list of banned persons or + organizations and is the opposite of a whitelist. + question: How would you adjust the word choice to use Inclusive IT terminology? + answer: > + As a verb, denylist can mean to put an individual or entity on such a + list.
A denylist is synonymous with a list of banned persons or + organizations and is the opposite of an allowlist. diff --git a/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/attribution.txt b/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/attribution.txt new file mode 100644 index 000000000..1ced667f3 --- /dev/null +++ b/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/attribution.txt @@ -0,0 +1,4 @@ +Title of work: Rewrite complex texts in an easier way +Link to work: - +License of the work: CC-BY-SA-4.0 +Creator names: Alicia Jiménez Avilés \ No newline at end of file diff --git a/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/qna.yaml b/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/qna.yaml new file mode 100644 index 000000000..b13a38a08 --- /dev/null +++ b/taxonomy/compositional_skills/grounded/linguistics/writing/rewriting/qna.yaml @@ -0,0 +1,33 @@ +created_by: aliciaibm +version: 3 +seed_examples: + - answer: 'Scientists are studying how our genes and the world around us affect our + health.' + context: 'The elucidation of the intricate interplay between genetic + predisposition and environmental factors remains a focal point of + contemporary biomedical research.' + question: Could you rewrite this text in a simpler way? + - answer: 'Technology is growing fast, and now we use digital tools to talk and + connect in new ways.' + context: 'The exponential growth of technological innovation has led to the + proliferation of digital platforms, fundamentally transforming the way we + communicate and interact.' + question: Rewrite the text in an easier way + - answer: 'Scientists have been figuring out how tiny particles act using quantum + mechanics, a big idea in physics.' + context: 'The elucidation of the implications of quantum mechanics for the + behavior of subatomic particles has been a cornerstone of modern physics.'
+ question: Could you give me the text rewritten for better understanding? + - answer: 'It is hard for policymakers and bioethicists to make rules for new + biotechnologies that everyone agrees are fair.' + context: 'The articulation of ethical frameworks for the regulation of emerging + biotechnologies presents a significant challenge for policymakers and + bioethicists alike.' + question: Could you tell me in a simple way what the text is saying? + - answer: 'We need ideas from lots of different fields to solve big problems and + come up with new stuff.' + context: 'The synthesis of diverse perspectives from interdisciplinary fields is + crucial for fostering innovation and addressing complex societal + challenges.' + question: Rewrite the text so that I can understand it better +task_description: 'Rewrite complex texts in an easier way' diff --git a/taxonomy/compositional_skills/grounded/philosophy/.gitignore b/taxonomy/compositional_skills/grounded/philosophy/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/religion/.gitignore b/taxonomy/compositional_skills/grounded/religion/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/science/.gitignore b/taxonomy/compositional_skills/grounded/science/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/grounded/technology/.gitignore b/taxonomy/compositional_skills/grounded/technology/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/history/.gitignore b/taxonomy/compositional_skills/history/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/linguistics/.gitignore b/taxonomy/compositional_skills/linguistics/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/linguistics/synonyms/attribution.txt
b/taxonomy/compositional_skills/linguistics/synonyms/attribution.txt new file mode 100644 index 000000000..60ed5b93d --- /dev/null +++ b/taxonomy/compositional_skills/linguistics/synonyms/attribution.txt @@ -0,0 +1,4 @@ +Title of work: To teach a language model how to get a certain number of synonyms. +Link to work: - +License of the work: CC BY-NC-SA 4.0 +Creator names: Yanping Liu diff --git a/taxonomy/compositional_skills/linguistics/synonyms/qna.yaml b/taxonomy/compositional_skills/linguistics/synonyms/qna.yaml new file mode 100644 index 000000000..76fe4241e --- /dev/null +++ b/taxonomy/compositional_skills/linguistics/synonyms/qna.yaml @@ -0,0 +1,66 @@ +version: 3 +created_by: yanpliu +seed_examples: + - answer: 'Synonym for Attend is take part in + + ' + question: List a synonym for the word attend. + - answer: 'Two synonyms for Attend are + + take part in + + be present at + + ' + question: List two synonyms for the word attend and separate with newline. + - answer: 'Three synonyms for Attend are + + take part in + + be present at + + be there at + + ' + question: List three synonyms for the word attend and separate with newline. + - answer: 'Four synonyms for Attend are + + take part in + + be present at + + be there at + + make an appearance at + + ' + question: List four synonyms for the word attend and separate with newline. + - answer: 'Five synonyms for Beautiful are + + attractive + + pretty + + gorgeous + + good-looking + + handsome + + ' + question: List five synonyms for the word beautiful and separate with newline. + - answer: 'Five antonyms for attend are + + absent + + neglect + + disregard + + disown + + ignore + + ' + question: List five antonyms for the word attend and separate with newline. +task_description: to teach a large language model to list the synonyms for a word.
diff --git a/taxonomy/compositional_skills/miscellaneous_unknown/.gitignore b/taxonomy/compositional_skills/miscellaneous_unknown/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/philosophy/.gitignore b/taxonomy/compositional_skills/philosophy/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/religion/.gitignore b/taxonomy/compositional_skills/religion/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/science/.gitignore b/taxonomy/compositional_skills/science/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/compositional_skills/technology/.gitignore b/taxonomy/compositional_skills/technology/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/docs/KNOWLEDGE_GUIDE.md b/taxonomy/docs/KNOWLEDGE_GUIDE.md new file mode 100644 index 000000000..3e5831be8 --- /dev/null +++ b/taxonomy/docs/KNOWLEDGE_GUIDE.md @@ -0,0 +1,152 @@ +# Knowledge Guide + +## What is "Knowledge"? + +Knowledge consists of data and facts and is backed by documents. When you create knowledge for a model, you're giving it additional data to more accurately answer questions. + +Knowledge contributions in this project contain a few things. + +- A file in a git repository that holds your information. For example, these repositories can include markdown versions of information on: Oscar 2024 winners, Law books, Shakespeare, Sports, Chemistry, etc. +- A `qna.yaml` file that asks and answers questions about the information in the git repository. +- A `attribution.txt` that includes the sources for the information used in the `qna.yaml`. + +You can learn more about the knowledge structure in [Getting Started with Knowledge contributions](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-knowledge-contributions). 
+ +## Accepted Knowledge + +> [!IMPORTANT] +> We are currently only accepting knowledge contributions as a limited private beta and sources will be limited to articles from Wikipedia. + +There are a few domains of knowledge that we are currently accepting. For a full list of knowledge fields, see [Knowledge domains](https://github.com/instructlab/taxonomy/blob/main/knowledge/knowledge_domains.md) in the taxonomy documentation + +A few examples are as follows: + +### STEM fields + +- Physics + - Astronomy and Astrophysics + - Quantum Mechanics + - Special Relativity and General Relativity + +- Chemistry & Chemical Engineering + - Organic Chemistry + - Inorganic Chemistry + - Chemical engineering + - Biotechnology + +- Earth & Environmental Science + - Geology + - Geography + +- Biology & Life Sciences + - Plants (Botany) + - Medicine & health + +- Electrical Engineering +- Bioengineering +- Civil Engineering +- Industrial Engineering + +### Legal and regulatory + +- Intellectual Property +- Criminal Law +- Civil Rights +- Healthcare compliance + +### Economy and Business + +- Economy and Businesses +- Accounting and Finance +- Marketing +- Human Resource +- Management + +### Philosophy + +- Philosophy +- Metaphysics +- Epistemology +- Ethics +- Parapsychology & occultism +- Philosophical schools of thought + +### Literature + +- Literature, rhetoric & criticism +- American literature in English +- Other literatures + +## Avoid These Topics + +While the tuning process may eventually benefit from being used to help the models work with complex social topics, at this time this is an area of active research we do not want to take lightly. Therefore please keep your submissions clear of the following topics: + +- PII (personally identifiable information) or any content invasive of individual privacy rights +- Violence including self-harm +- Cyber Bullying +- Internal documentation or other that is confidential to your employer or organization, e.g. 
trade secrets +- Discrimination +- Religion + - Facts such as, "[Christianity is, according to the 2011 census, the fifth most practiced religion in Nepal, with 375,699 adherents, or 1.4% of the population](https://en.wikipedia.org/wiki/Christianity_in_Nepal)", are fine as a knowledge contribution. Advocating in favor of or against any religious faith is not acceptable. +- Medical or health information + - Facts such as, "[In mammals, pulmonary ventilation occurs via inhalation (breathing)](https://opentextbc.ca/biology/chapter/11-3-circulatory-and-respiratory-systems/)," are fine as a knowledge contribution. Tailored medical/health advice is not acceptable. +- Financial information + - Facts such as "[laissez-faire economics ... argues that market forces alone should drive the economy and that governments should refrain from direct intervention in or moderation of the economic system](https://openstax.org/books/world-history-volume-2/pages/6-3-capitalism-and-the-first-industrial-revolution)," are fine as a knowledge contribution. Tailored financial advice is not acceptable. +- Legal settlements/mitigations +- Gender Bias +- Hostile Language, threats, slurs, derogatory or insensitive jokes or comments +- Profanity +- Pornography and sexually explicit or suggestive content +- Any contributions that would allow for automated decision making that affect an individual's rights or well-being, e.g. social scoring +- Any contributions that engage in political campaigning or lobbying + +We are also not accepting submissions of the following content: + +- Code + - Anything code-related that can be traced back to code for a computer. Not limited to `sed` or `bash` but `yaml`s for OpenShift or Kubernetes, to `python` snippets to `Java` suggestions. There are specific models focused on this space and this isn't for this model for the time being. 
+- Jokes +- Poems + +We received many joke and poem submissions at the beginning of the project, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful. For now, we're not accepting additional submissions of jokes and poems. + +## Building Your LLM Intuition + +LLMs have inherent limitations that make certain tasks extremely difficult, like doing math problems. They're great at other tasks, like creative writing. And they could be better at things like logical reasoning. + +Giving an LLM knowledge creates a basis of information that it can learn from; you can then teach it to use this knowledge via the `qna.yaml` files. + +For example, you can give an LLM the entire periodic table, then in a `qna.yaml` add something like: + +question: What is the symbol and atomic number for Chlorine? +answer: | + The symbol for chlorine is Cl and the atomic number is 17. + +With a few of these qna's, the model will learn the periodic table because it has the knowledge data. + +### LLMs are great at + +For these, however, it's common for LLMs to already have excellent performance. Try 3-5 examples in `lab chat` to confirm a deficit in the model before you build your submission, and share the examples in your Pull Request (PR). + +- Brainstorming +- Creativity +- Connecting information +- Cross-lingual behavior + +### LLMs need help with + +LLM behavior on these sorts of topics is very difficult for the model to get right. Try several examples to understand the nuances of the model's ability to do these sorts of tasks, and consider using corrections to the results you get in your tuning process.
+ +- Chains of reasoning +- Analysis +- Story plots +- Reassembling information +- Effective and succinct summaries + +### LLMs are not so great at + +LLMs may struggle with solving math and computation. That said, improving some of these foundational skills may be something this work tackles in the future, but not at this time. + +- Math +- Computation +- "Turing-complete" type tasks +- Generating only true real-world information (they're prone to hallucinations) diff --git a/taxonomy/docs/README.md b/taxonomy/docs/README.md new file mode 100644 index 000000000..134d811f1 --- /dev/null +++ b/taxonomy/docs/README.md @@ -0,0 +1,36 @@ +# Docs + +The purpose of these documents is to inform users and maintainers on the review/triaging process and different ways to contribute to the taxonomy repository. You can view the following: + +- Full layout of taxonomy repository - [Taxonomy diagram](taxonomy_diagram.md) +- Flowchart of the pull request review process - [Review process](assets/review-process.png) +- Flowchart of how the backend works with reviewing PRs - [Backend process](assets/backend.png) + +## Overview of the Review Process + +The review process starts with a contributor drafting the PR. + +1. At the PR stage, reviewers manually check its contents (e.g. making sure the examples are added to the correct path, inspecting the contents of examples). + - If not valid, go back to the contributor and ask them to take actions. +2. If a PR passes step 1, the SDG will be triggered to generate synthetic data samples. + - If not valid, go back to the contributor and ask them to take actions. +3. If step 2 passes, it will be used in the next model update. + +See the [README.md in `instructlab/docs`](https://github.com/instructlab/instructlab/blob/main/docs/README.md) on how to modify and render the flowcharts.
+ +For more information on the review process, see [Pull request review in CONTRIBUTING.md](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#pull-request-review). + +## Triaging documentation + +For more information on triaging contribution pull requests, see: + +- [Safe responses for common PR mistakes](triaging/safe-responses.md) +- [Triaging guide](triaging/triaging-contributions.md) + +## Contributing documentation + +For documentation contribution processes, see: + +- [Contributing using the GitHub webpage UI](contributing_via_GH_UI.md) +- [Knowledge contribution guidelines](knowledge-contribution-guide.md) +- [CONTRIBUTING.md](../CONTRIBUTING.md) diff --git a/taxonomy/docs/SKILLS_GUIDE.md b/taxonomy/docs/SKILLS_GUIDE.md new file mode 100644 index 000000000..50205944c --- /dev/null +++ b/taxonomy/docs/SKILLS_GUIDE.md @@ -0,0 +1,226 @@ +# Skills Guide + +## What is a "Skill"? + +There are various types of skills that you can contribute to the taxonomy. + +### Compositional Skills + +Skills are performative. When you create a skill for the model, you're +teaching it how to do something: "write me a song," "rearrange words in a sentence" or +"summarize an email." + +There are two types of compositional skills: + +#### Freeform Compositional Skills + +Freeform compositional skills are performative and do **not** require additional context. An example of a freeform skill is teaching the model words that rhyme. You could provide examples of "words that rhyme with 'tool'". By providing those examples, you're essentially tickling the latent knowledge of the LLM. In our example, you're enabling the LLM to be able to identify words that rhyme in its latent knowledge. + +Freeform skills include things like: + +* Speak like Yoda +* Convert to camel case +* Write me a limerick +* Generate Stable Diffusion prompts + +#### Grounded Compositional Skills + +Grounded skills are performative and **do** require additional context.
An +example of a grounded skill would be to read the value of a cell in a table +layout, or to parse a JSON file. To create a grounded skill to read a +Markdown-formatted table layout, the additional context could be an example +table layout. This additional context is included in the YAML for the +skill and not external to it. + +> [!NOTE] +> The content of the table layout will not be used in training +> or aligning the model; only the table layout format itself will be used. + +Grounded skills include things like: + +* Game creation like Sudoku or tic tac toe +* Summarizing or extracting from a piece of text +* Finding unresolved items in a meeting transcript + +[Example Grounded compositional skill pull request](https://github.com/instructlab/taxonomy/pull/250) + +### Core Skills + +Core skills are foundational skills like math, reasoning, and coding. + +> [!NOTE] +> Unlike **knowledge** and **compositional skills**, core skills +> are not contributable to the tree. So when you see reference to contributing +> "skills" to the taxonomy from this point forward, it is **compositional +> skills** that are being referenced. + +## Accepted Skills + +### Creative Writing / Poetics + +Adding new types of documents and writing styles to the LLM is welcome. Consider: + +* Song lyrics +* Soliloquies +* Five paragraph essays +* Arguments + +### Learning to Format Information + +Skills to better format and reassemble information are helpful. + +### Table Analysis and Processing + +Consider: + +* Drawing verbal inferences and conclusions about what's in a table +* Sorting +* Selecting +* Joining + +### Qualitative Inference and Chain-of-Thought Reasoning + +Example: + +> Mary is taller than John. +> John is taller than Anna. +> Is Anna taller than Mary? + +Example: + +> An elephant, a mouse and a horse are in a room. How would they be ordered if they were standing in order by size?
+ +Great skills in this category should include the correct line of reasoning in the answer, not just what the answer is. + +### Word Problems + +Is your LLM smarter than a second grader? + +### Trust and Safety + +Please avoid HAP (hate, abuse and profanity) and PII (personally identifiable information) in your examples. + +Anything related to trust and safety will be flagged for higher-level review. + +### Searching, Extraction and Summarization + +Skills to select odd information in a document, draw conclusions, pull out information, draw insights or generate TODOs from information provided in the "context" field are welcome. + +### Complex Rulesets and Games + +> [!NOTE] +> This is a good example of the need for a *grounded skill*. Grounded skills require the user to provide context containing information that the model is expected to take into account during processing. This is different from *knowledge*, where the model is expected to gain facts and background knowledge from the tuning process. +> +> Context added when tuning a grounded skill would need to be provided again by the end user at inference time. The skill here is better adherence to the rule set. + +To add a skill for a structured game or other task with a complex rule set, use a grounded skill. Add the rules to the game as "context" in every example. Add the interpretation as a question. + +### Writing Style and Personalities + +When adding a skill, expect that you're tuning a fairly general-purpose LLM to behave better given particular circumstances. + +If you want to add a skill to better adopt a particular personality - say, "a little boy living in the 1800s" - that context needs to be provided in either the "context" or "question" field. + +### Instruction-Following Behavior + +LLMs could be better at following extra instructions in a prompt about how to do a task, such as: "Keep your response to 200 words." Or: "Only produce 10 items."
Skills to improve this behavior will help the model behave with more precision. + +## Skills to Avoid + +There are several types of skills that we don't expect this procedure to improve. Most skills in these categories will be rejected. + +### Math + +Trying to make the LLM solve math problems will be rejected. + +### Real world knowledge-based skills + +Unless it can be framed as a "grounded skill", where the user is expected to provide context, knowledge contributions will be a separate part of the taxonomy. Skills shouldn't expect the model to come up with its own facts, but instead assemble facts provided. + +### Red Teaming + +Adversarial questions and answers will be rejected at this time. + +### Turing-complete style problems + +These are an edge case, but things like palindromes and regexes, where getting the right answer with a non-stochastic program would be easy, aren't good targets for the moment. + +Open an issue in the taxonomy repository if you have an idea in this space before submitting a PR. + +### Small Changes to Original Response + +If the original LLM response is pretty close, but it's not responding to your exact expectations, a skill is not the right way to solve that problem. + +## Avoid These Topics + +While the tuning process may eventually benefit from being used to help the models work with complex social topics, at this time this is an area of active research we do not want to take lightly. Therefore please keep your submissions clear of the following topics: + +* PII (personally identifiable information) or any content invasive of individual privacy rights +* Violence including self-harm +* Cyber Bullying +* Internal documentation or other that is confidential to your employer or organization, e.g. 
trade secrets +* Discrimination +* Religion + * Facts such as, "[Christianity is, according to the 2011 census, the fifth most practiced religion in Nepal, with 375,699 adherents, or 1.4% of the population](https://en.wikipedia.org/wiki/Christianity_in_Nepal)", are fine as a knowledge contribution. Advocating in favor of or against any religious faith is not acceptable. +* Medical or health information + * Facts such as, "[In mammals, pulmonary ventilation occurs via inhalation (breathing)](https://opentextbc.ca/biology/chapter/11-3-circulatory-and-respiratory-systems/)," are fine as a knowledge contribution. Tailored medical/health advice is not acceptable. +* Financial information + * Facts such as "[laissez-faire economics ... argues that market forces alone should drive the economy and that governments should refrain from direct intervention in or moderation of the economic system](https://openstax.org/books/world-history-volume-2/pages/6-3-capitalism-and-the-first-industrial-revolution)," are fine as a knowledge contribution. Tailored financial advice is not acceptable. +* Legal settlements/mitigations +* Gender Bias +* Hostile Language, threats, slurs, derogatory or insensitive jokes or comments +* Profanity +* Pornography and sexually explicit or suggestive content +* Any contributions that would allow for automated decision making that affect an individual's rights or well-being, e.g. social scoring +* Any contributions that engage in political campaigning or lobbying + +We are also not accepting submissions of the following content: + +* Jokes +* Poems +* Code + * Anything code-related that can be traced back to code for a computer. Not limited to `sed` or `bash` but `yaml`s for OpenShift or Kubernetes, to `python` snippets to `Java` suggestions. There are specific models focused on this space and this isn't for this model for the time being. 
+* "Guard Rails" for AI + * We expect our upstream engineering team to create these types of skills and safeguards. We appreciate our community wanting to help with this, but there are underlying engineering decisions, and accepting these contributions from the community may conflict with them. + +We received so many joke and poem submissions at the beginning, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful. + +## Building Your LLM Intuition + +LLMs have inherent limitations that make certain tasks extremely difficult, like doing math problems. They're great at other tasks, like creative writing. And they could be better at things like logical reasoning. + +Consider these when you're generating skills. Skills in the first and second categories are welcomed. Skills in the third category are usually borderline and may be rejected. + +### LLMs are great at + +Skills in this category are welcomed, as refining these abilities helps us get better at the kinds of tasks where LLMs can excel. + +For these, however, it's common for LLMs to already have excellent performance. Try 3-5 examples in `lab chat` to confirm a deficit in the model before you build your submission, and share the examples in your Pull Request (PR). + +* Brainstorming +* Creativity +* Connecting information +* Cross-lingual behavior + +### LLMs need help with + +Skills in this category are welcomed, since LLM behavior in these sorts of topics is very difficult for the model to get right. Try several examples to understand the nuances of the model's ability to do these sorts of tasks, and consider using corrections to the results you get in your tuning process.
+ +* Chains of reasoning +* Analysis +* Story plots +* Reassembling information +* Effective and succinct summaries + +### LLMs are not so great at + +Skills in this category are ways in which LLMs struggle, and may always struggle. Solving math and computation problems via probability on natural language queries is probably not the best way to solve them. That said, improving some of these foundational skills may be something this work tackles in the future, but not at this time. + +Most skill submissions in these categories are likely to be rejected. + +For hallucinations in particular, trying to solve this with a skill is unlikely to work. Consider contributing to the Knowledge taxonomy when it opens instead to improve the model's understanding of facts. + +* Math +* Computation +* "Turing-complete" type tasks +* Generating only true real-world information (they're prone to hallucinations) diff --git a/taxonomy/docs/assets/add_skill_branch.png b/taxonomy/docs/assets/add_skill_branch.png new file mode 100644 index 000000000..e474ca212 Binary files /dev/null and b/taxonomy/docs/assets/add_skill_branch.png differ diff --git a/taxonomy/docs/assets/backend.png b/taxonomy/docs/assets/backend.png new file mode 100644 index 000000000..af82c7d18 Binary files /dev/null and b/taxonomy/docs/assets/backend.png differ diff --git a/taxonomy/docs/assets/comp_skill.png b/taxonomy/docs/assets/comp_skill.png new file mode 100644 index 000000000..ea96f1c06 Binary files /dev/null and b/taxonomy/docs/assets/comp_skill.png differ diff --git a/taxonomy/docs/assets/linguistics.png b/taxonomy/docs/assets/linguistics.png new file mode 100644 index 000000000..859a05230 Binary files /dev/null and b/taxonomy/docs/assets/linguistics.png differ diff --git a/taxonomy/docs/assets/new_file.png b/taxonomy/docs/assets/new_file.png new file mode 100644 index 000000000..08c00efe6 Binary files /dev/null and b/taxonomy/docs/assets/new_file.png differ diff --git a/taxonomy/docs/assets/qna_ex.png 
b/taxonomy/docs/assets/qna_ex.png new file mode 100644 index 000000000..73778ced1 Binary files /dev/null and b/taxonomy/docs/assets/qna_ex.png differ diff --git a/taxonomy/docs/assets/review-process.png b/taxonomy/docs/assets/review-process.png new file mode 100644 index 000000000..0e611ae5f Binary files /dev/null and b/taxonomy/docs/assets/review-process.png differ diff --git a/taxonomy/docs/assets/simile_folder.png b/taxonomy/docs/assets/simile_folder.png new file mode 100644 index 000000000..bfe1fae15 Binary files /dev/null and b/taxonomy/docs/assets/simile_folder.png differ diff --git a/taxonomy/docs/assets/tax_labels.png b/taxonomy/docs/assets/tax_labels.png new file mode 100644 index 000000000..2805a2899 Binary files /dev/null and b/taxonomy/docs/assets/tax_labels.png differ diff --git a/taxonomy/docs/assets/triage_schedule.png b/taxonomy/docs/assets/triage_schedule.png new file mode 100644 index 000000000..af8c00aa1 Binary files /dev/null and b/taxonomy/docs/assets/triage_schedule.png differ diff --git a/taxonomy/docs/contributing_via_GH_UI.md b/taxonomy/docs/contributing_via_GH_UI.md new file mode 100644 index 000000000..6bae44daa --- /dev/null +++ b/taxonomy/docs/contributing_via_GH_UI.md @@ -0,0 +1,63 @@ +# Contributing using the GitHub webpage UI + +There are a few ways you can create pull requests in an open source project: Locally using the git CLI tool in your terminal or using the GitHub webpage user interface. The following instructions show you how to create a PR using the GitHub webpage UI + +## Prerequisites + +- You have a GitHub account +- You have access to this repo + +## Make a copy of the taxonomy repo + +1. Go to [github.com/instructlab/taxonomy](https://github.com/instructlab/taxonomy). + +2. Click **Fork** to fork your own copy of the repo. + +3. 
On the **Create a new fork** page, enter the information into the following fields: + - **Repository name:** Name your fork the same as the repository; in this case, `taxonomy` is the name of your fork. + - **Description:** Enter the description of _your fork_, not of the skills you will create. You can write something that makes sense to you or leave it blank. + - **Copy the main branch only:** The box is selected by default. You can choose to leave the box selected or clear it. + +4. Click **Create Fork**. + +You will get a copy of the taxonomy repo in your GitHub account. This is your own copy, so don't worry about making mistakes. _If you do end up making a mistake and want to start over, you can delete the fork and create a new fork._ + +## Contributing a skill + +1. IMPORTANT: Before you begin, create a working branch for your contribution and name it whatever you like. This allows you to have an up-to-date `main` branch in your fork as well as a working branch to add any changes. + + ![add_branch](assets/add_skill_branch.png) + +2. Navigate to the directory where you want your knowledge or skill. The following image shows the compositional skills directory and its contents. Skills are contributed to this directory: + + ![comp_skill](assets/comp_skill.png) + + The other top-level directory you can contribute to is the knowledge directory, which is used for knowledge contributions. You can read more about the difference between skills and knowledge in the [community documentation](https://github.com/instructlab/community/blob/main/docs/README.md). + +3. Based on the directories that exist in the tree, make a best guess at where in the tree structure to add the skill that you want to contribute. If you get to a point where you've gone deep enough into the tree and you can't find any directories that match, create a new directory (and subdirectories, if needed) to best represent your skill.
+ + For example, I want to train the model to learn how to create sentences using rhetorical devices, specifically similes. I started in the compositional_skills section, and navigated to linguistics. + + ![linguistics_file](assets/linguistics.png) + + There is currently no rhetorical_devices folder in the linguistics section, so I can create one called "rhetorical_devices", then click `/` to create another folder called "simile". + + ![new_file](assets/new_file.png) + + ![simile_folder](assets/simile_folder.png) + +4. It can be a little tricky mechanically to create directories in GitHub's web UI, but you can complete the process using the following steps: + + a. In the GitHub repo, click the folder that you want to create the new directory inside of. + + b. Click Add File and select Create new file from the menu. + + c. Type the name of the first directory that you want to create. The example image uses rhetorical_devices as the first directory, then metaphor as the next. + + d. After you have entered the name of all of the directories that you want to add, type the file name. The file name should always be qna.yaml (qna stands for "Question aNd Answer.") + + ![qna_example](assets/qna_ex.png) + + e. You can then click "commit changes" to your branch. The GitHub UI will prompt you to open a pull request. Select the "open pull request" button. + +5. Verify that your YAML follows the proper structure. See [Knowledge: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#knowledge-yaml-examples) and [Skills: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#skills-yaml-examples) to help with formatting. The [yamllint](https://www.yamllint.com/) tool is another great way to verify YAML.
diff --git a/taxonomy/docs/knowledge-contribution-guide.md b/taxonomy/docs/knowledge-contribution-guide.md new file mode 100644 index 000000000..cc2849018 --- /dev/null +++ b/taxonomy/docs/knowledge-contribution-guide.md @@ -0,0 +1,71 @@ +# Knowledge contribution guidelines + +You can create a Git repository to host your knowledge contributions anywhere (GitLab, Gerrit, etc.) but it may be favorable to create one on GitHub. The following instructions show you how to create a knowledge repository in GitHub and contribute to the taxonomy. + +## Prerequisites + +- You have a GitHub account +- You have a forked copy of the [taxonomy](https://github.com/instructlab/taxonomy/tree/main) repository +- You have verified that the model does not already know the knowledge you want to submit + +## Creating your own knowledge repository + +To create a new GitHub repository, follow the GitHub documentation in [Creating a new repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-new-repository). + +The specific steps are listed as follows: + +1. In your GitHub profile page, navigate to the repositories tab. You will see a search bar where you can search your repositories, or create a new one. +2. Creating a new repository takes you to a page titled “Create a new repository”. Create a custom name for your repository and add a README.md file. For example, “knowledge_contributions” could be a good name for your repository. +3. Click “Create” when you are all set. + +## Convert your knowledge documentation to markdown + +There are many online tools that can help you convert your documents to markdown. If you are using a wiki page for your contributions, you can use [pandoc](https://pandoc.org/try/) to convert the documents. For Wikipedia sources on pandoc, use `from: mediawiki` and convert `to: markdown_strict` to access the proper markdown format.
+ +## Add the markdown file to your repository + +To add a file to your GitHub repository, follow the GitHub documentation in [Adding a file to a repository](https://docs.github.com/en/repositories/working-with-files/managing-files/adding-a-file-to-a-repository). + +The specific steps are listed as follows: + +1. Navigate to “Add files”. Click “Create new file” if you want to manually add your markdown content. Click “Upload files” if you have a file locally to add. +2. Add a description and commit your changes. + + Since this is your own repository, you can commit directly to the `main` branch. + +3. You can then see your new content in your repository. + +> [!IMPORTANT] +> Make a note of your commit SHA; you need it for your `qna.yaml`. + +## Create a pull request in the taxonomy repository + +Navigate to your forked taxonomy repository and ensure it is up to date. + +There are a few ways you can create a pull request: + +- For details on the local process, check out [The GitHub Workflow Guide](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md) in the Kubernetes documentation and the [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow) in the GitHub documentation. +- For details on contributing using the GitHub webpage UI, see [Contributing using the GH UI](https://github.com/instructlab/taxonomy/blob/main/docs/contributing_via_GH_UI.md) or [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request?tool=webui) in the GitHub documentation. + +## Verification + +Here are a few things to check before seeking reviews for your contribution: + +- Your `qna.yaml` follows the proper formatting. See examples in [Knowledge: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#knowledge-yaml-examples). +- Ensure all parameters are set.
Especially the `document`, `repo`, `commit` and `patterns` keys; these parameters are specific to knowledge contributions and require more analysis. +- Include an `attribution.txt` file for citing your sources. See [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) for more information. + +## PR Upstream Workflow + +The following table outlines the expected timing for the PR(s) you have put in. The PRs go through a few steps and checks, but you should be able to map your PR's label to +the stage that it is in. + +| Label | Actor | Action | Duration | +| --- | --- | --- | --- | +| | Contributor | Submit PR | - | +| | Contributor | Fix failed PR checks | - | +| https://github.com/instructlab/taxonomy/labels/triage-needed | Triager | Review PR, ask for changes | Days | +| https://github.com/instructlab/taxonomy/labels/triage-requested-changes | Contributor | Make requested changes | Days | +| https://github.com/instructlab/taxonomy/labels/precheck-generate-ready | Triager | Run prechecks and generate | Days | +| https://github.com/instructlab/taxonomy/labels/community-build-ready | Backend | Model gets retrained | Weeks | +| | Triager | Check the numbers and merge or close the PR | - | diff --git a/taxonomy/docs/taxonomy_diagram.md b/taxonomy/docs/taxonomy_diagram.md new file mode 100644 index 000000000..0c0c1cea5 --- /dev/null +++ b/taxonomy/docs/taxonomy_diagram.md @@ -0,0 +1,38 @@ +## Taxonomy diagram + +> [!Note] +> This diagram shows a subset of the taxonomy. It is not a complete representation.
+ +```mermaid + flowchart TD; + na[not accepting contributions\n at this time]:::na + taxonomy --> foundational_skill & compositional_skills & knowledge + + foundational_skill:::na --> reasoning:::na + reasoning:::na --> common_sense_reasoning:::na + reasoning:::na --> mathematical_reasoning:::na + reasoning:::na --> theory_of_mind:::na + + compositional_skills --> engineering + compositional_skills --> grounded + compositional_skills --> lingustics + + grounded --> grounded/arts + grounded --> grounded/geography + grounded --> grounded/history + grounded --> grounded/science + + knowledge --> knowledge/arts + + knowledge --> knowledge/miscellaneous_unknown + knowledge --> knowledge/science + knowledge --> knowledge/technology + knowledge/science --> animals --> birds --> black_capped_chickadee --> black_capped_chikadee-a & black_capped_chikadee-q + knowledge/science --> astronomy --> constellations --> phoenix --> phoenix-a & phoenix-q + + black_capped_chikadee-a{attribution.txt} + black_capped_chikadee-q{qna.yaml} + phoenix-a{attribution.txt} + phoenix-q{qna.yaml} + classDef na fill:#EEE +``` diff --git a/taxonomy/docs/taxonomy_diagram.png b/taxonomy/docs/taxonomy_diagram.png new file mode 100644 index 000000000..f5f0729c4 Binary files /dev/null and b/taxonomy/docs/taxonomy_diagram.png differ diff --git a/taxonomy/docs/template_qna.yaml b/taxonomy/docs/template_qna.yaml new file mode 100644 index 000000000..efc66eb0c --- /dev/null +++ b/taxonomy/docs/template_qna.yaml @@ -0,0 +1,57 @@ +version: 3 +domain: +created_by: +seed_examples: + - context: | + + questions_and_answers: + - question: | + + answer: | + + - question: | + answer: | + - question: | + answer: | + - context: | + questions_and_answers: + - question: | + answer: | + - question: | + answer: | + - question: | + answer: | + - context: | + questions_and_answers: + - question: + answer: | + - question: | + answer: | + - question: | + answer: | + - context: | + questions_and_answers: + - question: | + 
answer: | + - question: | + answer: | + - context: | + questions_and_answers: + - question: | + answer: | + - question: | + answer: | + - question: | + answer: | +document_outline: | + +document: + repo: https://github.com//.git + commit: + patterns: + - .md + - .pdf + - "*.md" + - "*.pdf" + - "/*.md" + diff --git a/taxonomy/docs/triaging/insights/images/InstructLab-Insights-report.png b/taxonomy/docs/triaging/insights/images/InstructLab-Insights-report.png new file mode 100644 index 000000000..3d49e35c9 Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/InstructLab-Insights-report.png differ diff --git a/taxonomy/docs/triaging/insights/images/gh-bot.png b/taxonomy/docs/triaging/insights/images/gh-bot.png new file mode 100644 index 000000000..233cdd306 Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/gh-bot.png differ diff --git a/taxonomy/docs/triaging/insights/images/gh-message_analysis-in-progress.png b/taxonomy/docs/triaging/insights/images/gh-message_analysis-in-progress.png new file mode 100644 index 000000000..fc51a1b5f Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/gh-message_analysis-in-progress.png differ diff --git a/taxonomy/docs/triaging/insights/images/gh-message_no-violation.png b/taxonomy/docs/triaging/insights/images/gh-message_no-violation.png new file mode 100644 index 000000000..3a79c6e1f Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/gh-message_no-violation.png differ diff --git a/taxonomy/docs/triaging/insights/images/gh-message_potential-violation.png b/taxonomy/docs/triaging/insights/images/gh-message_potential-violation.png new file mode 100644 index 000000000..2b7b4ed82 Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/gh-message_potential-violation.png differ diff --git a/taxonomy/docs/triaging/insights/images/gh-workflow.png b/taxonomy/docs/triaging/insights/images/gh-workflow.png new file mode 100644 index 000000000..35f63d574 Binary files 
/dev/null and b/taxonomy/docs/triaging/insights/images/gh-workflow.png differ diff --git a/taxonomy/docs/triaging/insights/images/login-screen.png b/taxonomy/docs/triaging/insights/images/login-screen.png new file mode 100644 index 000000000..7464a131b Binary files /dev/null and b/taxonomy/docs/triaging/insights/images/login-screen.png differ diff --git a/taxonomy/docs/triaging/insights/taxonomy-insights-documentation.md b/taxonomy/docs/triaging/insights/taxonomy-insights-documentation.md new file mode 100644 index 000000000..696a328ac --- /dev/null +++ b/taxonomy/docs/triaging/insights/taxonomy-insights-documentation.md @@ -0,0 +1,59 @@ +# InstructLab Contribution Insights + +This module automatically analyzes the taxonomy of pull request contribution files, checking for potential code of conduct violations. + +The analysis currently covers the following categories: + +- **PII**: Privacy (Personally Identifiable Information) +- **HAP**: Hate, Abuse, Profanity +- **Adult**: Sexual and erotic content + +**Note**: For knowledge contributions, analysis of referenced sources included in markdown files is out of scope for the current version but is planned for the future. + +## Automation workflow + +The insights analysis is triggered by a GitHub Action on every PR creation or update. The high-level flow is described in the following diagram: + +![Insight Analysis Automation using GitHub workflow](images/gh-workflow.png) + +### Main workflow steps + +1. **PR is Opened**: The Contributor opens a Pull Request (PR). +2. **Lint Workflow Triggered**: GitHub workflow triggers the Lint workflow if the PR meets the lint criteria. +3. **Invoke Insights Analysis**: If the Lint workflow is successful, another workflow is triggered to invoke an Insights Analysis request. +4. **Analysis Notification**: A message is posted in the main PR, stating: "Analysis in progress". +5.
**Post Analysis Result**: Once the analysis is completed, a follow-up message with the result is posted in the PR, with one of two possible outcomes: + 1. **No Violations Found**: No code of conduct violations detected. + 2. **Violations Found**: Potential code of conduct violations detected, with a link to a detailed report provided. + +#### GitHub messages + +#### Analysis Notification + +A message is posted in the main PR, stating: "Analysis in progress". Here is a screen capture of the message: + +![Analysis in progress](images/gh-message_analysis-in-progress.png) + +#### No Violations Found + +If no code of conduct violations are detected, a message is posted in the main PR, as shown in the screen capture below: + +![No Violations Found](images/gh-message_no-violation.png) + +#### Potential Violations to the code of conduct + +When potential violations to the code of conduct are identified, the following message is shown, and a link to a report is provided (see more below): + +![Potential Violations to the CoC](images/gh-message_potential-violation.png) + +## Insights Report Page + +When potential violations of the code of conduct are identified, a separate report link is provided. + +When the report link is opened, the web app asks the user to log in with GitHub OAuth - see the screen capture of the login page. + +![Login screen](images/login-screen.png) + +After login, the user sees the report that lists all the potential violations found, with each violation category highlighted in a different color. Confidence scores are also provided.
See the screen capture of the report below: + +![Potential Violations to the CoC](images/InstructLab-Insights-report.png) \ No newline at end of file diff --git a/taxonomy/docs/triaging/safe-responses.md b/taxonomy/docs/triaging/safe-responses.md new file mode 100644 index 000000000..de57d55bd --- /dev/null +++ b/taxonomy/docs/triaging/safe-responses.md @@ -0,0 +1,43 @@ +# Templates of safe responses to common PR mistakes + +## Skills Content + +### Jokes and Poems + +Thank you for taking the time to submit to InstructLab! I know this wasn't easy and could have been quite time-consuming. Unfortunately, our team has decided that submissions for jokes or poems (skills and knowledge) are no longer supported or approved. We received so many at the beginning, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful. + +We have updated the [topics to avoid](../SKILLS_GUIDE.md#avoid-these-topics) to reflect this, and if you have more questions, don't hesitate to comment on this PR. We may revisit this in the future, but we are looking for unique skills or new knowledge for the model; poems and jokes, unfortunately, don't fit this plan. + +Thank you again for taking this time, and we are looking forward to your next PR to the InstructLab project! + +### Math + +LLMs aren't great at counting, math, or computation. Right now we're not seeking to improve the model's math and computation ability as this skill belongs to ["core skill"](../SKILLS_GUIDE.md#core-skills), which is not open to contribution. + +This will be rejected for now and we can reconsider if we open a math / computation taxonomy. + +### Coding + +Thanks for the submission! 
Coding belongs to ["core skill"](../SKILLS_GUIDE.md#core-skills), which is not open to contribution. + +This will be rejected for now and we can reconsider if we open a programming / coding taxonomy. + +### Knowledge not Skill + +Given the content of the file, it seems that this would likely fall under knowledge, which we are currently accepting contributions for under different [guidelines](../../README.md#getting-started-with-knowledge-contributions). + +Closing this PR, but feel free to resubmit as knowledge following those guidelines! + +## Contribution Formatting Guidelines + +### Not Enough Examples + +Could you please make sure there are five examples with good diversity, meaning having more examples that have very little overlapping content? We have recently updated the [readme](../../README.md#getting-started-with-skill-contributions) with the new requirement. Thank you for your contribution! + +### Attribution + +Thanks for the submission! It looks interesting. Based on the [current guide](../../README.md#getting-started-with-skill-contributions), do you mind adding attribution to the examples? + +### Mentions of GPT, ChatGPT, Bard, or other proprietary LLMs + +Since this is a non-proprietary project, please remove all mentions of the names of commercial or proprietary LLMs; we prefer wording such as “LLMs” or “Foundation Models”. 
diff --git a/taxonomy/docs/triaging/triaging-contributions.md b/taxonomy/docs/triaging/triaging-contributions.md new file mode 100644 index 000000000..d06ff4422 --- /dev/null +++ b/taxonomy/docs/triaging/triaging-contributions.md @@ -0,0 +1,145 @@ +## Triaging contributions + +- [Triaging guide](#triaging-guide) + - [Basic review questions](#basic-review-questions) + - [Subjective review questions](#subjective-review-questions) +- [Triager responsibilities](#triager-responsibilities) + - [Labels](#labels) + - [Label workflow guide](#label-workflow-guide) +- [Triaging schedule](#triaging-schedule) + +## Triaging guide + +Before you begin, familiarize yourself with the basic concepts around skills and knowledge, as well as the file formats for compositional skills, grounded vs freeform skills, and knowledge. + +- [InstructLab Community Learning Guide](https://github.com/instructlab/community/blob/main/docs/README.md) +- [Taxonomy README](../README.md) + +Triaging is the practice of reviewing existing [skill and knowledge pull requests (PRs)](https://github.com/instructlab/taxonomy/pulls?q=is%3Apr+is%3Aopen+label%3Askill) to make sure they're relevant, actionable, and have all the information needed to be fully evaluated by both the Taxonomy Triage team (Triagers, @taxonomy-triagers) and the Taxonomy Approvers (Approvers, @taxonomy-approvers). + +Triagers review open pull requests and use labels to manage their state and any actions needed. Triagers are also encouraged to provide informative and helpful comments either back to the contributor, to other Triagers or to the Approvers. And remember to be nice. + +> [!IMPORTANT] +> Triagers DO NOT MERGE skills pull requests. This action will be done after final approval by @taxonomy-approvers + +### Basic review questions + +- Does the PR have the pull request template information filled out? + - If skill has not been run through lmdk, assign unverified label +- Did all the PR checks pass? 
+- Does the skill have 5 or more examples? + - **NOTE 2024-03-12:** This has been increased from 3 in the most recent guidance from the approvers! +- Make sure fields in YAML are correct + - configure tooling eventually: linting, formatting +- No PII in content (may eventually be automated) +- No toxic or hateful content (HAP - hate, abuse, and profanity) (may eventually be automated) +- Was the response clearly generated by an LLM (not easy to tell, but flag it if very obvious) (may eventually be automated) + +### Subjective review questions + +- Is this a skill or knowledge? +- Do we think that the model could actually be improved by the examples? + - Is this a skill that you can even teach an LLM? +- Is the skill appropriately placed within the taxonomy? (If outside of skill directory, address the issue) + +> [!NOTE] +> The skill taxonomy structure is used in several ways: +> +> 1. Selecting the right subset of the taxonomy to use for data generation. +> 2. Interpretability by human contributors and maintainers. +> 3. As part of the prompt to the model used to generate synthetic samples. +> Therefore: Make sure the names of directories match the intent of the +> taxonomy files, perhaps also see if there's a more logical place in the +> taxonomy structure for a person's contribution to live before signing off. + +### Potential automation: (at a later date) + +- HAP filtering +- PII filtering +- sanity check: is the model response similar to or completely different from the one provided by the contributor? +- sanity check: is this a skill that you can even teach an LLM? +- generation check: do the teacher-model-generated instructions actually make sense with the skill being added? 
+ +## Triager Responsibilities + +### Labels + +There are a few labels that the triager is responsible for when reviewing PRs: + +- https://github.com/instructlab/taxonomy/labels/ci - the PR touches our CI system +- https://github.com/instructlab/taxonomy/labels/enhancement - the PR has a new feature or request +- https://github.com/instructlab/taxonomy/labels/knowledge - the PR is a knowledge contribution +- https://github.com/instructlab/taxonomy/labels/legal-hold - the PR is a good suggestion but we need legal signoff, or review +- https://github.com/instructlab/taxonomy/labels/github_actions - the PR touches our GitHub Actions configuration +- https://github.com/instructlab/taxonomy/labels/help%20wanted - extra attention is needed +- https://github.com/instructlab/taxonomy/labels/question - further information is requested +- https://github.com/instructlab/taxonomy/labels/precheck-generate-ready - The PR has passed all the linting and the "code" is now in the model engagement loop +- https://github.com/instructlab/taxonomy/labels/sdg-unsuccessful - PR failed Synthetic Data Generation +- https://github.com/instructlab/taxonomy/labels/stale - the stale-bot has marked the PR as stale +- https://github.com/instructlab/taxonomy/labels/skill - a skills contribution as opposed to documentation contribution or a knowledge contribution +- https://github.com/instructlab/taxonomy/labels/topic-failure - a topic that we are not accepting (leave comment on specifics) +- https://github.com/instructlab/taxonomy/labels/triage-approved - triage team has signed off + - re-assign to @taxonomy-approvers + - add comment and tag @taxonomy-approvers +- https://github.com/instructlab/taxonomy/labels/triage-follow-up - triager needs to follow up after requested changes have been made +- https://github.com/instructlab/taxonomy/labels/triage-needed - skill needs a triager to review it + - triager assigns the PR to themself when beginning review +- 
https://github.com/instructlab/taxonomy/labels/triage-requested-changes - skill has been reviewed; changes requested from contributor + - triager provides comment in PR asking for additional changes or information + - triager assigns to contributor +- https://github.com/instructlab/taxonomy/labels/triage-rejected - skill fails criteria + - add informative comment while tagging @taxonomy-approvers + - re-assign to @taxonomy-approvers +- https://github.com/instructlab/taxonomy/labels/triage-uncertain - triager is uncertain, which can be for a variety of reasons + - triager stays assigned + - use comment to ask the rest of the triage team for input tagging @taxonomy-triagers + - if still uncertain + - then re-assign to @taxonomy-approvers + - triager tags @taxonomy-approvers in informative comment asking for further review from that team + +### Label Workflow guide + +![tax_label](../assets/tax_labels.png) + +### Helpful guidance for different determinations + +#### Reasons for approval + +- Generation seeds (successfully creates more instructions in a `.jsonl` file) +- Meets all criteria + +#### Reasons for needing further review + +- Needs more extensive edits +- General "I Don't Know" +- Safety tasks and skills should always be escalated to @taxonomy-approvers +- Super interesting, warrants further study + +#### Reasons for rejection + +- Submitted knowledge not a skill. For example, troubleshooting on an uncommon IBM Storage Fusion error message. +- Obvious LLM answer, blocklist. + - If you're not familiar with what ChatGPT / Bard / etc writing typically looks like, play with it a bit until you can recognize the tone and linguistic patterns. +- Couldn’t verify that the model actually lacks the skills — i.e. the model can already answer the submitted questions well enough. +- The provided example of the model response is too short and neglects reasoning details. For example: A logical question requires multi-step reasoning to reach the final answer. 
The submitted model response only gives the final answer. +- Uninformative examples. For example, not all examples match the skill requested; or the user didn’t put three independent question/answer pairs for the skill, but mistakenly submitted three chat turns for the three question/answer pairs; or overly repetitive examples which do not help to clearly define the requested skill. +- Missing examples: didn’t provide desired model response for the skill. + +> [!NOTE] +> Skills triagers should try to include as much information as possible about why the contribution is rejected. +> +### Scrubbing data from issues and pull requests + +- Title: edit title to remove information +- Comment: simply edit or delete a comment; if info is very sensitive and needs to be fully deleted, after editing the comment, use the edit history dropdown menu in the comment to delete previous versions of the comment’s content +- Description of issue or pull request cannot be simply deleted, so follow process above to edit and delete history revisions +- Code (in pull request files): + - Do NOT close PR or delete source branch yet (**important** as this would disconnect the PR from the PR source branch and the PR's changed files view would remain visible) + - Edit/delete the files on the forked branch (clone the fork, checkout the PR's branch, edit the file(s), git amend last commit or reset HEAD~n to revert last n commits, force push) + - Now close PR, delete source branch + - The original now orphaned commits can still be found, but it takes some effort and the changed files view no longer shows any of the sensitive information + - Edit any comments on the PR with sensitive info and delete the previous versions + +## Triaging schedule + +![tax_label](../assets/triage_schedule.png) diff --git a/taxonomy/foundational_skills/reasoning/common_sense_reasoning/qna.yaml b/taxonomy/foundational_skills/reasoning/common_sense_reasoning/qna.yaml new file mode 100644 index 000000000..a434c9531 --- /dev/null +++ 
b/taxonomy/foundational_skills/reasoning/common_sense_reasoning/qna.yaml @@ -0,0 +1,28 @@ +created_by: IBM +seed_examples: +- answer: "The answer is 3 killers. A person comes into the room and kills one of\ + \ the 3 killers, so 2 of the 3 killers remain. However the new person, having\ + \ killed one of the killers, is now a killer. Hence there are 3 killers in the\ + \ room. \n" + question: 'There are 3 killers in a room. A new person comes into the room and kills + one of the 3 people. How many killers are in the room now? Explain the answer + step by step. + + ' +- answer: 'The answer is 4 hours. Since I am drying the shirts in the sun, it does + not matter if I dry 4 or 8 shirts, it will take the same amount of time. + + ' + question: I am drying some shirts in a wide open space in the sun. If it takes 4 + hours to dry 4 shirts, how many hours does it take to dry 8 shirts? +- answer: 'The ball is now in the box. Jack put the ball in the basket. After he left, + Jill put the ball in the box, so it is now in the box. + + ' + question: "Jack and Jill are in a room with a ball, box, and a basket. Jack put\ + \ the ball in the basket and then went to school. After Jack left, Jill put the\ + \ ball in the box and went to school. Where is the ball now? Explain step by step.\ + \ \n" +task_description: 'To teach a language model about step by step reasoning + + ' diff --git a/taxonomy/foundational_skills/reasoning/linguistics_reasoning/logical_sequence_of_words/qna.yaml b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/logical_sequence_of_words/qna.yaml new file mode 100644 index 000000000..7c5fc4ec1 --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/logical_sequence_of_words/qna.yaml @@ -0,0 +1,62 @@ +created_by: IBM +seed_examples: +- answer: 'Here is a meaningful sequence: + + 1. Crime + + 2. Police + + 3. Judge + + 4. Judgment + + 5. 
Punishment + + ' + question: "Arrange the words given below in a meaningful sequence.\n1.Police\_ 2.\ + \ Punishment\_ 3. Crime\_ 4. Judge\_ 5. Judgment\n" +- answer: 'Here is a meaningful sequence: + + 1. Infection + + 2. Doctor + + 3. Consultation + + 4. Treatment + + 5. Recovery + + ' + question: 'Arrange the following words in a meaningful sequence. + + 1. Infection + + 2. Consultation + + 3. Doctor + + 4. Treatment + + 5. Recovery' +- answer: 'Here is a meaningful sequence: + + 1. Letters + + 2. Word + + 3. Phrase + + 4. Sentence + + 5. Paragraph + + ' + question: 'Arrange the words given below in a meaningful sequence. + + a.Word b.Paragraph c.Sentence d.Letters e.Phrase + + ' +task_description: 'To teach a language model about the logical sequence of words + + ' diff --git a/taxonomy/foundational_skills/reasoning/linguistics_reasoning/object_identification/qna.yaml b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/object_identification/qna.yaml new file mode 100644 index 000000000..2af68552c --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/object_identification/qna.yaml @@ -0,0 +1,25 @@ +created_by: IBM +seed_examples: +- answer: 'The list contains two distinct colors - blue and pink. + + ' + question: 'How many colors are there in this list: four, five, twenty six, blue, + alpha, pink. + + ' +- answer: 'There are 2 human body parts: Bone and tongue. Depending on if you consider + nails to be fingernails, then you could also include them as a third body part. + + ' + question: 'How many human body parts in this list and what are they: bone, table, + moon, tongue, mercury, postulate, America, nails.' +- answer: "The following items in the list are associated with both humans and animals:\ + \ intestine, cheating. \n" + question: 'In the list which items are associated with both humans and animals: + wisdom, utility, microscope, intestine, cheating, divorce, pulling, magnetism, + space, ludo, axiom? 
+ + ' +task_description: 'To teach a language model about Object identification + + ' diff --git a/taxonomy/foundational_skills/reasoning/linguistics_reasoning/odd_one_out/qna.yaml b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/odd_one_out/qna.yaml new file mode 100644 index 000000000..009d8d285 --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/linguistics_reasoning/odd_one_out/qna.yaml @@ -0,0 +1,46 @@ +created_by: IBM +seed_examples: +- answer: 'C. important. Remaining are synonyms of each other. + + ' + question: 'Which word is the odd man out? Also give an explanation why + + A. trivial + + B. unimportant + + C. important + + D. insignificant + + ' +- answer: 'D. Book. Rest are all parts of a book. + + ' + question: 'Which word does NOT belong with the others? Also give an explanation + why + + A. index + + B. glossary + + C. chapter + + D. book' +- answer: 'A. hate. Rest are positive emotions. + + ' + question: 'Which word is the odd man out? Also give an explanation why + + A. hate + + B. fondness + + C. liking + + D. attachment + + ' +task_description: 'To teach a language model about logical odd-one-out. + + ' diff --git a/taxonomy/foundational_skills/reasoning/logical_reasoning/causal/qna.yaml b/taxonomy/foundational_skills/reasoning/logical_reasoning/causal/qna.yaml new file mode 100644 index 000000000..bb965b23a --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/logical_reasoning/causal/qna.yaml @@ -0,0 +1,26 @@ +created_by: IBM +seed_examples: +- answer: 'While days tend to be longer in the summer, just because it is not summer + doesn''t mean days are necessarily shorter. + + ' + question: 'If it is summer, then the days are longer. Are the days longer if it + is not summer ? + + ' +- answer: 'No, we cannot conclusively conclude that some cats are black based solely + on the given premises. The statement "some mammals are black" does not necessarily + guarantee that among those mammals are cats. 
+ + ' + question: If all cats are mammals and some mammals are black, can we conclude that + some cats are black? +- answer: 'Yes, we can conclude that all squares have four sides based on the given + premises. + + ' + question: 'If all squares are rectangles and a rectangle has four sides, can we + conclude that all squares have four sides? + + ' +task_description: To teach a language model about Logical Reasoning - causal relationships diff --git a/taxonomy/foundational_skills/reasoning/logical_reasoning/general/qna.yaml b/taxonomy/foundational_skills/reasoning/logical_reasoning/general/qna.yaml new file mode 100644 index 000000000..75e761430 --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/logical_reasoning/general/qna.yaml @@ -0,0 +1,168 @@ +created_by: IBM +seed_examples: +- answer: 'If you have one bun and need to share it among 7 kids without splitting + it, each child would receive 1/7 of the bun. So the split fraction would be 1/7. + + ' + question: 'i have 7 kids and i have only one bun and i need to share it without + splitting, tell me the split fraction + + ' +- answer: 'The doctor instructs you to take one pill every half hour. So, you would + take the first pill immediately, the second pill after 30 minutes, and the third + pill after another 30 minutes, totaling one hour. After one hour, all three pills + would have been consumed. + + ' + question: If a doctor gives you three pills and tells you to take one pill every + half hour, how long would the pills last? +- answer: 'The answer to this question is still 5 minutes. + + The rate at which the widgets are produced remains constant regardless of the + number of machines. Each machine produces one widget in 5 minutes, so whether + there are 5 machines or 100 machines, it would still take 5 minutes to produce + 100 widgets. + + ' + question: 'If it takes 5 machines 5 minutes to make 5 widgets, how long would it + take 100 machines to make 100 widgets? 
+ + ' +- answer: 'The odd one out is Moon. All of the others are planets from the solar + system, but Moon is a satellite that orbits around the Earth. + + ' + question: 'Which of the following is the odd one out? + + A) Earth + + B) Mars + + C) Jupiter + + D) Moon + + ' +- answer: "A is D's aunt. \nD is the brother of E\nE is the daughter of B, making\ + \ D the son of B\nB is the sister of A, making A the aunt of both D and E. \n" + question: A, B and C are sisters, D is the brother of E and E is the daughter of + B. How is A related to D ? +- answer: '6\. The series is dividing the number in half at each step + + ' + question: 'Which number should come next in the series, 48, 24, 12, ......? + + ' +- answer: 'Sam doesn''t run faster than Jane as Jane runs faster than Joe and Joe + runs faster than Sam. + + ' + question: 'Jane runs faster than Joe. Joe runs faster than Sam. Does Sam run faster + than Jane? + + ' +- answer: 'When drying clothes outside, several factors can affect the drying time, + such as temperature, humidity, wind, and the number of shirts. + + If there''s sufficient space for drying shirts in parallel, drying 20 shirts could + take as long as drying 5 shirts so 4 hours. However, when there''s only sufficient + space for 5 shirts, you''ll need to dry the shirts in batches so considering that + you need to dry 20 shirts, you would need 4 batches bringing the total time to + 16 hours. + + + ' + question: You're an expert in house chores. It takes 4 hours to dry 5 shirts in + open air. How long does it take to dry 20 shirts? +- answer: 'Marc thinks the ball is in the basket as that''s where it was when he left + the room. John, on the other hand, thinks the ball is in the cupboard where he + put it before he left the room. + + ' + question: 'Marc and John are together in a room with a ball, a basket and a cupboard. + Marc puts the ball in the basket and leaves the room. John takes the ball and + puts in the cupboard and leaves the room. 
When they return in the evening, they + don''t know what happened to the ball after they''ve left the room. Where do they + each think the ball is? Explain your reasoning step by step. + + + + ' +- answer: 'Based on the information provided, the names of Mike''s mom''s four kids + are Luis, Drake, Matilda, and Mike. Therefore the name of the 4th kid is Mike. + + ' + question: 'Mike''s mom has 4 kids. 3 of them are Luis, Drake and Matilda. What is + the name of the 4th kid? + + ' +- answer: 'Let''s analyze the situation step by step: + + + 1\. We know that 4 cars traveled from San Francisco to San Jose, which is 41.60 + miles apart. + + 2\. Each car travels at a similar speed. Having more cars on the road will not + reduce the time that any one car takes to drive from San Francisco to San Jose. + + 3\. Therefore, the time it takes for each car to travel the distance is same, + 1 hour. + + ' + question: 4 cars left San Fransisco at 12pm and they reached San Jose, 41.60 mi + apart, at 1pm. How much time will be needed for 8 cars? Consider they have similar + speed. +- answer: 'Based on the information provided, we know that Jane has one car and that + it is in the driveway when she is at home. However, when Jane is at her office, + we know that she used her car for commuting. This implies that the car is not + in the driveway. As Jane only has 1 car, then there must be no cars in Jane''s + driveway now. Therefore the answer is 0. + + ' + question: 'Jane has 1 car. When she is at home then her car is in the driveway. + Jane is now at her office. Can you find the number of car in her driveway now? + + ' +- answer: 'C (The student''s marks is between 40 and 60.) + + ' + question: '"You are a teacher and need to give grade to the students. 
The grades + are defined as follows:\n\nFor marks less than 40, grade is F.\nFor marks between + 40 and 60, grade is C.\nFor marks between 60 and 80, grade is B.\nFor marks above + 80, grade is A.\n\nYou will be given a student''s marks as input and you need + to output the grade.\n\nMarks : 47\nGrade:\u00a0\n" + + ' +- answer: 'Intermediate + + ' + question: 'You are a fitness trainer designing workout plans for clients. Based + on number of hours a client exercised in a week, determine and output the appropriate + fitness level: + + + For less than 3 hours, fitness level is Beginner. + + For 3 to 5 hours, fitness level is Intermediate. + + For 5 to 7 hours, fitness level is Advanced. + + For more than 7 hours, fitness level is Expert. + + + Weekly exercise hours: 4 + + Fitness level: ' +- answer: 'Enthusiast + + ' + question: "As a travel guide, you want to assess the experiences of tourists. Based\ + \ on the number of landmarks visited by a tourist in a city, determine the travel\ + \ exploration level using the following guidelines: \n\nFor 0 to 2 landmarks,\ + \ exploration level is Casual. \nFor 3 to 5 landmarks, exploration level is Enthusiast.\ + \ \nFor 6 to 8 landmarks, exploration level is Adventurer. \nFor more than 8 landmarks,\ + \ exploration level is Explorer. \n\nNumber of landmarks visited: 4 \nExploration\ + \ level:\_\n" +task_description: 'To teach a language model about logical reasoning + + ' diff --git a/taxonomy/foundational_skills/reasoning/logical_reasoning/tabular/qna.yaml b/taxonomy/foundational_skills/reasoning/logical_reasoning/tabular/qna.yaml new file mode 100644 index 000000000..c1505e75c --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/logical_reasoning/tabular/qna.yaml @@ -0,0 +1,53 @@ +created_by: IBM +seed_examples: +- answer: "To predict the next value in the temperature sequence, we can look at the\ + \ pattern in the previous values. 
The differences between consecutive temperatures\ + \ are:\n\n2 (day 2 - day 1)\n3 (day 3 - day 2)\n3 (day 4 - day 3)\n4 (day 5 -\ + \ day 4)\n\nThis increasing difference suggests that the temperature will continue\ + \ to increase. To find the next temperature, we can add the difference from day\ + \ 4 to the temperature of day 4:\n\nNext temperature = Day 4 temperature + Difference\n\ + = 28 \xB0C + 3 \xB0C\n= 31 \xB0C\n\nTherefore, the next value in the sequence\ + \ is 31 degrees Celsius.\n" + question: "Given the following table, predict the next value in the sequence:\n\n\ + Day\tTemperature (\xB0C)\n1\t20\n2\t22\n3\t25\n4\t28\n5\t?\n\n" +- answer: 'North: Product A + + South: Product C + + East: Product A + + West: Product C + + ' + question: "You're an analyst examining the performance of a retail company's product\ + \ lines across different regions. Analyze the sales data provided in the table\ + \ below and identify the best-performing product line in each region. For example\ + \ the best performing product in North is Product B. Similarly identify for other\ + \ regions as well.\n\nRegion\tProduct A Sales (in $)\tProduct B Sales (in $)\t\ + Product C Sales (in $)\nNorth\t$10,000\t $12,000\t\ + \ $9,000\nSouth $8,000\t \ + \ $10,000\t $11,000\nEast\t $11,000\t\ + \ $9,000\t $10,000\n\ + West\t$9,000\t $11,000\t \ + \ $12,000" +- answer: 'To calculate the total number of fruits bought on each day we first sum + up the number of fruits bought in each day. + + Total fruits for A = 5 (Day 1) + 8 (Day 2) + 4 (Day 3) = 17 + + Total fruits for B = 3 (Day 1) + 6 (Day 2) + 7 (Day 3) = 16 + + Total fruits for C = 2 (Day 1) + 5 (Day 2) + 6 (Day 3) = 13 + + + Hence on Day 3 the lowest number of fruits were bought. 
+ + ' + question: "Consider the following table representing the number of fruits bought\ + \ by three different people (A, B, C) at a fruit stall over three days (Day 1,\ + \ Day 2, Day 3):\n\nPerson\tDay 1\tDay 2\tDay 3\nA\t5\t8\t4\nB\t3\t6\t7\nC\t2\t\ + 5\t6\nOn which day lowest number of fruits were brought?\n" +task_description: 'To teach a language model about logical reasoning using tabular + data + + ' diff --git a/taxonomy/foundational_skills/reasoning/mathematical_reasoning/qna.yaml b/taxonomy/foundational_skills/reasoning/mathematical_reasoning/qna.yaml new file mode 100644 index 000000000..40913127d --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/mathematical_reasoning/qna.yaml @@ -0,0 +1,26 @@ +created_by: IBM +seed_examples: +- answer: 'A kilo of feathers is heavier than a pound of hammers. 1 kilo is equal + to ~2.2046 pounds, so no matter what you are weighing 1 kilo will be heavier than + 1 pound + + ' + question: 'What is heavier? A kilo of feathers, or a pound of hammers? + + ' +- answer: "100 servers. If 1 Administrator can manage 10 servers, and we increase\ + \ that number by 10, we can assume that 10 administrators could handle 10 times\ + \ the number of servers 10x10=100 \n" + question: '10 servers require 1 administrator to manage. + + How many servers can I manage with 10 administrators?' +- answer: 'You need either 1 x 2 socket subscription, or 16 x 2-core subscriptions. + + ' + question: 'I have a server with 2 sockets, with 16 cores each. How many OpenShift + subscriptions do I need to purchase, with either 2 socket pair or 2 core pair + licensing model? + + + ' +task_description: "To teach a language model about mathematical operations. 
\n" diff --git a/taxonomy/foundational_skills/reasoning/temporal_reasoning/qna.yaml b/taxonomy/foundational_skills/reasoning/temporal_reasoning/qna.yaml new file mode 100644 index 000000000..00adb823a --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/temporal_reasoning/qna.yaml @@ -0,0 +1,68 @@ +created_by: IBM +seed_examples: +- answer: 'Hello! I''m here to help you understand the order of arrival at a party. + Let''s analyze the situation step by step: + + + 1. Joe reached first (before you). + + 2. You arrived next. + + 3. David arrived after you. + + 4. John reached last (after David). + + + Based on this information, Joe was the first one to arrive at the party, followed + by you, then David, and finally John. + + ' + question: 'Consider I went to a party. I reached before John. David arrived after. + Joe reached before me. John arrived after David. Who reached first and who reached + last? + + ' +- answer: 'Hello! I''m here to help you understand the order of arrival at a celebration. + Let''s analyze the situation step by step: + + + 1. Emma reached first (before Peter). + + 2. Peter arrived next. + + 3. Sophia arrived after Peter. + + 4. Olivia reached last (after Sophia). + + + Based on this information, Emma was the first one to arrive at the celebration, + followed by Peter, then Sophia, and finally Olivia. + + ' + question: At a celebration, Peter came in before Olivia. Sophia arrived after. Emma + got there before Peter. Olivia arrived after Sophia. Who reached first, and who + reached last? +- answer: 'Hello! I''m here to help you understand the order of photos in a photography + competition. Let''s analyze the situation step by step: + + + 1. Emily captured her photo first (before James). + + 2. James captured a photo next. + + 3. Alex entered with a photo after James. + + 4. Sarah submitted her photo last (after Alex). 
+ + + Based on this information, Emily''s photo was captured first, followed by Jame''s + entry, then Alex''s photo, and finally Sarah''s submission. + + + ' + question: 'In a photography competition, James captured a photo before Sarah. Alex''s + entry came after. Emily took a photo before James. Sarah submitted her photo after + Alex. Whose photo was captured first, and whose photo was captured last? + + ' +task_description: "To teach a language model about chronological order of time. \n" diff --git a/taxonomy/foundational_skills/reasoning/theory_of_mind/qna.yaml b/taxonomy/foundational_skills/reasoning/theory_of_mind/qna.yaml new file mode 100644 index 000000000..e3aca793f --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/theory_of_mind/qna.yaml @@ -0,0 +1,84 @@ +created_by: IBM +seed_examples: +- answer: "Sarah placed her shoes under her dress before she went shopping. Her sister\ + \ borrowed the shoes and put them under Sarah\u2019s bed. Sarah doesn\u2019t know\ + \ that her sister borrowed her shoes. Therefore, the answer is yes, she assumes\ + \ that the shoes are still under her dress.\n" + question: "Read the scenario and answer the following question: Scenario: \"The\ + \ morning of the high school dance Sarah placed her high heel shoes under her\ + \ dress and then went shopping. That afternoon, her sister borrowed the shoes\ + \ and later put them under Sarah\u2019s bed.\" Question: When Sarah gets ready,\ + \ does she assume her shoes are under her dress?\n" +- answer: 'When the Garcia family left, the game was tied, 3-3, and they expected + the game to be postponed due to the rain. So, they don''t know that the game ended + during their commute with a score of 5-3. Therefore, the answer is no, they don''t + believe the score is 5-3. + + ' + question: 'Read the scenario and answer the following question: Scenario: "Expecting + the game to be postponed because of the rain, the Garcia family took the subway + home. The score was tied, 3-3. 
During their commute the rain stopped, and the + game soon ended with a score of 5-3." Question: When the Garcia family arrives + home, do they believe the score is 5-3?' +- answer: 'When Lisa left Jacob, he was asleep on the beach. So, Lisa doesn''t know + that Jacob woke up and went swimming. Therefore, the answer is yes, Lisa believes + that Jacob is still asleep on the beach. + + ' + question: 'Read the scenario and answer the following question: Scenario: "When + Lisa left Jacob, he was deep asleep on the beach. A few minutes later a wave woke + him. Seeing Lisa was gone, Jacob decided to go swimming." Question: Does Lisa + now believe that Jacob is asleep? + + ' +- answer: 'Since Pam has been at work all day, she doesn''t know that the tulips in + her backyard have suddenly bloomed. Therefore, the answer is no, when driving + home she doesn''t assume that her tulips have bloomed. + + ' + question: "Read the scenario and answer the following question: Scenario: \"The\ + \ weather was so warm today that all the tulips in Pam\u2019s backyard suddenly\ + \ bloomed. The tulips next to Pam\u2019s office still have not yet flowered, though.\ + \ Pam has been at work all day.\" Question: When Pam is driving home after work,\ + \ does she assume her tulips have bloomed?\n" +- answer: 'The scenario mentions that Jeff put on a pink shirt instead of a white + one. This means he wanted to put on a white shirt but since he is colorblind and + can''t tell the difference, he mistakenly put on a pink shirt. He doesn''t know + he is wearing a pink shirt. Therefore, the answer is yes, he thinks he is wearing + a white shirt. + + ' + question: "Read the scenario and answer the following question: Scenario: \"When\ + \ Jeff got ready this morning, he put on a light pink shirt instead of a white\ + \ one. Jeff is colorblind, so he can\u2019t tell the difference between subtle\ + \ shades of color.\" Question: Does Jeff believe he is wearing a white shirt?" 
+- answer: 'The scenario mentions that the lab was empty that night. Therefore, the + answer is yes, the number of people in the lab that night is zero. + + ' + question: 'Read the scenario and answer the following question: Scenario: "To detect + intruders, the lab uses an automated system for recording voices. In the empty + lab one night, a computer error occurs and a synthetic voice reads the error message." + Question: Is the number of people in the lab that night zero? + + ' +- answer: "The scenario mentions that the country\u2019s economic success was recorded\ + \ in books from the early 1900s. Therefore, the answer is yes, early 1900s novels\ + \ portray the country as experiencing economic wealth.\n" + question: "Read the scenario and answer the following question: Scenario: \"Accounts\ + \ of the country\u2019s economic success were recorded in books from the early\ + \ 1900s. Soon after, a horrible plague hit the country, and the country was sent\ + \ into an economic depression.\" Question: Do early 1900s novels portray the country\ + \ as experiencing economic wealth?" +- answer: 'The scenario mentions that a part of the garden is reserved for roses and it + has a label showing that. Therefore, the answer is yes, the label says the flowers + are roses. 
+ + ' + question: "Read the scenario and answer the following question: Scenario: \"Part\ + \ of the garden is supposed to be reserved for the roses; it\u2019s labeled accordingly.\ + \ Recently the garden has run wild, and dandelions have taken over the entire\ + \ flower bed.\" Question: Does the label say these flowers are roses?\n" +task_description: 'To teach a language model about theory-of-mind reasoning + + ' diff --git a/taxonomy/foundational_skills/reasoning/unconventional_reasoning/lower_score_wins/qna.yaml b/taxonomy/foundational_skills/reasoning/unconventional_reasoning/lower_score_wins/qna.yaml new file mode 100644 index 000000000..3394737b8 --- /dev/null +++ b/taxonomy/foundational_skills/reasoning/unconventional_reasoning/lower_score_wins/qna.yaml @@ -0,0 +1,29 @@ +created_by: IBM +seed_examples: +- answer: 'No, that is incorrect. In golf, the objective is to score as few points + as possible through various methods such as driving, chipping, and putting. The + player with the lowest score at the end of the round wins. + + ' + question: 'Part of golf is trying to get a higher point total than others. Yes or + No? + + ' +- answer: "In rummy, the player with the lowest score wins. \n" + question: In the card game rummy does the player with the highest score or the lowest + score win? +- answer: 'In most running competitions, including marathons, the winner is the runner + who completes the racecourse in the shortest amount of time. So, in this case, + the runner with a time of 4 hours and 20 minutes would win because they completed + the race faster than the runner with a time of 4 hours and 30 minutes. + + ' + question: 'one runner has a time of 4h 30 and another a time of 4h 20, which one + wins? + + ' +task_description: 'To teach a language model about how different activities have + different performance measures and lower scores win in many sports, e.g. running, + golf and shooting. 
+ + ' diff --git a/taxonomy/governance.md b/taxonomy/governance.md new file mode 100644 index 000000000..e2c377b77 --- /dev/null +++ b/taxonomy/governance.md @@ -0,0 +1,3 @@ +# InstructLab Governance + +For information about how the InstructLab project governance operates, see [InstructLab Governance](https://github.com/instructlab/community/blob/main/GOVERNANCE.md). diff --git a/taxonomy/knowledge/arts/.gitignore b/taxonomy/knowledge/arts/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/arts/music/fandom/swifties/attribution.txt b/taxonomy/knowledge/arts/music/fandom/swifties/attribution.txt new file mode 100644 index 000000000..930bf69b1 --- /dev/null +++ b/taxonomy/knowledge/arts/music/fandom/swifties/attribution.txt @@ -0,0 +1,5 @@ +Title of work: Swifties +Link to work: https://en.wikipedia.org/wiki/Swifties +Revision: https://en.wikipedia.org/w/index.php?title=Swifties&oldid=1221036610 +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors \ No newline at end of file diff --git a/taxonomy/knowledge/arts/music/fandom/swifties/qna.yaml b/taxonomy/knowledge/arts/music/fandom/swifties/qna.yaml new file mode 100644 index 000000000..e0ae82f58 --- /dev/null +++ b/taxonomy/knowledge/arts/music/fandom/swifties/qna.yaml @@ -0,0 +1,173 @@ +version: 3 +domain: Fandom +created_by: mairin +seed_examples: + - context: | + Critics have opined that Swift has redefined artist–fandom relationships + by establishing an intimate connection with Swifties. She has frequently + engaged with, helped, credited and prioritized her fans, who have + offered unprecedented support and interest in her works irrespective of + her wavering reception in the media. 
They continued to support Swift + through her genre transitions, unexpected artistic pivots, and her + highly publicized controversies such as the 2019 masters dispute, while + instigating the political scrutiny of Ticketmaster that led to + implementation of various laws and stimulated economic growth with the + Eras Tour. Swift's releases, promotional efforts, and fashion have + garnered attention for incorporating Easter eggs and clues that are + decoded by Swifties and considered part of her musical universe. + + Cultural analyses have variably described Swifties as a community of + interest, a subculture, and a near-metaverse, while academics have + studied them for their consumerism, content creation, social capital, + collective effervescence, organizing prolificacy, and interpersonal + relationships. The word Swiftie(s) was added to the Oxford Dictionary of + English in 2023. Swifties have also been a subject of criticism, with + some fans displaying disregard for Swift's privacy by publicizing her + real-time locations and engaging in verbal attack of individuals, + including celebrities, who malign Swift. On the other hand, some + Swifties criticize Swift for her lifestyle and professional choices, + which journalists disapprove as a parasocial relationship. + questions_and_answers: + - question: | + Is Swiftie a real word? + answer: | + Yes, Swiftie is a real word. Swiftie was added to the Oxford + Dictionary of English in 2023. + - question: | + What do Swifties think about Ticketmaster? + answer: | + Swifties supported Taylor Swift in scrutinizing Ticketmaster's + practices and helped advocate for various laws now in place to + help address problems with those practices. + - question: | + How did Swifties come about? + answer: | + Taylor Swift has cultivated Swifties as a group by establishing + intimate connections with her fans, redefining artist-fandom + relationships. 
+ - context: | + The word "Swiftie" for a Swift fan gained popularity in the late 2000s. + Etymologically, the word is formed from Swift's name and the suffix + "ie", which is often used in diminutives to imply affection. Swift + stated in a 2012 Vevo interview that her fans call themselves + "Swifties", which she found "adorable". Swift filed the term for + trademark in March 2017. In 2023, Oxford Dictionary of English defined + Swiftie as a noun meaning "an enthusiastic fan of the singer Taylor + Swift." As per the dictionary, some words that collocate with Swiftie in + popular usage are "fandom", "die-hard", "hardcore" and "self- + proclaimed". According to Dictionary.com, the term Swiftie often implies + that the person is "a very passionate and loyal fan—as opposed to just a + casual listener." + + questions_and_answers: + - question: | + How does someone become a Swiftie? + answer: | + Swifties are "self-proclaimed." + - question: | + What are the requirements to be considered a Swiftie? + answer: | + Swifties are die-hard, hardcore fans who are enthusiastic, + passionate, and loyal to Taylor Swift. They are not casual + listeners. To be considered a Swiftie, you must embody + these qualities. + - question: | + How does Taylor Swift feel about the term "Swifties"? + answer: | + Taylor Swift has said she finds the term Swifties to be + "adorable", which she said in a 2012 Vevo interview. + - context: | + The nicknames generally used by Swifties for Swift include "Blondie" + (referencing her blonde hair), "T-Swizzle" (after the lyrics in Swift's + 2009 parody song "Thug Story" with American record producer T-Pain), and + "The Music Industry" (referring to her influence on the music industry). + Chinese Swifties dubbed her "Meimei" (霉霉), a pun based on the Chinese + character "Mei" (霉) for "unlucky" that has since been adopted by Chinese + state media. + questions_and_answers: + - question: | + What are some nicknames for Taylor Swift? 
+ answer: | + "Blondie", in reference to her blonde hair; "T-Swizzle", after lyrics + in a 2009 parody song called "Thug Story"; "Meimei" (霉霉), a pun + based on the Chinese character "Mei" (霉). + - question: | + What do Chinese Swifties call Taylor Swift? + answer: | + Swifties in China call Taylor Swift "Meimei" (霉霉), which is a pun + based on the Chinese character "Mei" (霉), which means "unlucky." + - question: | + Why is Taylor Swift sometimes called "T-Swizzle"? + answer: | + "T-Swizzle" is a reference to a song called "Thug Story" that Taylor + Swift released in 2009 with T-Pain. + - context: | + Journalists describe Swift's works, celebrity, and the fanfare + surrounding them as a world of its own, dubbing it a music "universe" + subject to analyses by Swifties. Propagated by her prolific use of + Easter eggs and "unusually close connection with her fans", Swift is a + source of myth in popular culture. Her outfits, accessories, diction, + color coding, and numerology have also been Easter eggs. Swifties are + known for their fan theories, having gained a reputation as "the best + online sleuths" for breaking down and associating various elements they + consider as clues or Easter eggs. According to Bruce Arthur of Toronto + Star, "Swift is followed by fans whose dedication to her mythology is + Byzantine and layered and complex and messianic." + questions_and_answers: + - question: | + What are some different elements for Swiftie fan theories about + Taylor Swift? + answer: | + Swiftie fan theories include analyses of Taylor Swift's outfits, + accessories, diction, color coding, numerology, and Easter Eggs. + - question: | + What American pop singer fanbase are known as "the best online + sleuths?" + answer: | + Swifties, the fan base for Taylor Swift, are known as "the best + online sleuths," in part due to how they develop rich fan theories + based on signals they interpret from Swift herself. 
+ - question: | + What does Taylor Swift have to do with mythology? + answer: | + The devotion of Swifties towards analyzing various attributes and + signals they interpret as coming from Taylor Swift has been + referenced as "dedication to her mythology [which] is Byzantine + and layered and complex and messianic," by Bruce Arthur of the + Toronto Star. + - context: | + Swift is known for her album rollouts and promotional concepts, often + referred to as "eras". Each era is characterized by an aesthetic idea, + color palette, mood, and a fashion style. As such, Swift has reinvented + her image and style throughout her career, which Ashley Lutz of Fortune + felt aided in broadening her fanbase. Today senior editor Elena Nicolaou + reported on how Swifties, who are mostly millennials, have incorporated + Swiftie culture into their weddings and other events. + questions_and_answers: + - question: | + What is a Taylor Swift "era"? + answer: | + "Era" in Swiftie parlance refers to Taylor Swift's album rollouts + and promotional concepts, characterized by specific aesthetic ideas, + color palettes, moods, and fashion styles. + - question: | + What is Taylor Swift's image and style like? + answer: | + Taylor Swift is known for reinventing her image and style throughout + her career as part of her various "eras," different aesthetic + concepts reflected in her albums and promotions. + - question: | + What do Swifties have to do with weddings? + answer: | + Taylor Swift's millennial fanbase often incorporates Swiftie culture + into their weddings and other important life milestone events, + according to Today senior editor Elena Nicolaou. +document_outline: | + Overview of Swifties, explaining what they are, their behaviors, how + they came to be, and their relationship with Taylor Swift. 
+ +document: + repo: https://github.com/juliadenham/Summit_knowledge.git + commit: 5f7158a5ce83c4ff493bfe341fe31ecad64ff697 + patterns: + - swifties.md diff --git a/taxonomy/knowledge/engineering/.gitignore b/taxonomy/knowledge/engineering/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/geography/.gitignore b/taxonomy/knowledge/geography/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/history/.gitignore b/taxonomy/knowledge/history/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/attribution.txt b/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/attribution.txt new file mode 100644 index 000000000..893a3ade6 --- /dev/null +++ b/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/attribution.txt @@ -0,0 +1,4 @@ +Source: https://en.wikipedia.org/wiki/Hikmat_Abu_Zayd +License: Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0) +Revision: Retrieved on 23 June 2025 + diff --git a/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/qna.yaml b/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/qna.yaml new file mode 100644 index 000000000..0d4d7c48f --- /dev/null +++ b/taxonomy/knowledge/history/biography/egypt/hikmat_abu_zayd/qna.yaml @@ -0,0 +1,52 @@ +version: 3 +metadata: + category: history/biography/egypt/hikmat_abu_zayd + name: hikmat_abu_zayd + language: en + description: Q&A about the life and legacy of Egypt's first female cabinet minister, Hikmat Abu Zayd. + +seed_examples: + - context: Hikmat Abu Zayd was Egypt’s first female cabinet minister, serving as Minister of Social Affairs from 1962 to 1965 under President Gamal Abdel Nasser. + qna: + - question: Who was Egypt’s first female cabinet minister? + answer: Hikmat Abu Zayd + - question: What position did Hikmat Abu Zayd hold in the Egyptian government? 
+ answer: She served as Minister of Social Affairs. + - question: When did Hikmat Abu Zayd serve as Minister of Social Affairs? + answer: From 1962 to 1965. + + - context: Hikmat Abu Zayd was born in 1922 in Asyut Governorate, Upper Egypt. She studied at Cairo University and later at the University of London and St Andrews. + qna: + - question: Where was Hikmat Abu Zayd born? + answer: In Asyut Governorate, Upper Egypt. + - question: Which universities did Hikmat Abu Zayd attend? + answer: Cairo University, University of London, and St Andrews. + - question: What was her field of study? + answer: History and social sciences. + + - context: Abu Zayd was known for her commitment to social justice and her policies to support widows, orphans, and the poor. + qna: + - question: What causes was Hikmat Abu Zayd known for? + answer: Social justice and support for widows, orphans, and the poor. + - question: What was her reputation as Minister? + answer: She was known as the “minister of the poor.” + - question: Who did Hikmat Abu Zayd aim to help through her policies? + answer: The marginalized, including widows and orphans. + + - context: After Nasser’s death, Hikmat Abu Zayd opposed Anwar Sadat’s policies and lived in exile in the UK for several years. + qna: + - question: Why did Hikmat Abu Zayd go into exile? + answer: She opposed Anwar Sadat’s policies. + - question: Where did she live during her exile? + answer: In the United Kingdom. + - question: When did she return to Egypt? + answer: She returned later in life but was no longer politically active. + + - context: Hikmat Abu Zayd passed away in Cairo on July 30, 2011. She is remembered as a trailblazer for Egyptian women in politics. + qna: + - question: When did Hikmat Abu Zayd die? + answer: On July 30, 2011. + - question: Where did she pass away? + answer: In Cairo, Egypt. + - question: How is she remembered? + answer: As a trailblazer for women in Egyptian politics. 
diff --git a/taxonomy/knowledge/linguistics/.gitignore b/taxonomy/knowledge/linguistics/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/mathematics/.gitignore b/taxonomy/knowledge/mathematics/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/miscellaneous_unknown/.gitignore b/taxonomy/knowledge/miscellaneous_unknown/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/philosophy/.gitignore b/taxonomy/knowledge/philosophy/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/religion/.gitignore b/taxonomy/knowledge/religion/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/science/.gitignore b/taxonomy/knowledge/science/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/attribution.txt b/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/attribution.txt new file mode 100644 index 000000000..6e2b427d4 --- /dev/null +++ b/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/attribution.txt @@ -0,0 +1,5 @@ +Title of work: Black-capped chickadee +Link to work: https://en.wikipedia.org/wiki/Black-capped_chickadee +Revision: https://en.wikipedia.org/w/index.php?title=Black-capped_chickadee&oldid=1219937468 +License of the work: CC-BY-SA-4.0 +Creator names: Wikipedia Authors diff --git a/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/qna.yaml b/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/qna.yaml new file mode 100644 index 000000000..ecfb1a004 --- /dev/null +++ b/taxonomy/knowledge/science/animals/birds/black_capped_chickadee/qna.yaml @@ -0,0 +1,189 @@ +created_by: juliadenham +version: 3 +domain: animals +seed_examples: + - context: | + The **black-capped chickadee** (***Poecile atricapillus***) is a small, + nonmigratory, North 
American [passerine](passerine "wikilink") bird that + lives in deciduous and mixed forests. It is a member of the + [Paridae](Paridae "wikilink") [family](family_(biology) "wikilink"), + also known as tits. It has a distinct black cap on its head, a black bib + underneath, and white cheeks. It has a white belly, buff sides, and grey + wings, back, and tail. The bird is well known for its vocalizations, + including its *fee-bee* call and its *chick-a-dee-dee-dee* call, from + which it derives its name. + + The black-capped chickadee is widely distributed throughout North + America, ranging from the northern United States to southern Canada and + all the way up to Alaska and [Yukon](Yukon "wikilink"). It feeds + primarily on insects and seeds, and is known for its ability to + [cache](Hoarding_(animal_behavior) "wikilink") food for use during the + winter. The [hippocampus](hippocampus "wikilink") of the black-capped + chickadee grows during the caching season, which is believed to help it + better remember its cache locations. The black-capped chickadee is a + social bird and forms strict [dominance + hierarchies](Dominance_hierarchy "wikilink") within its flock. During + the winter, these flocks include other bird species. It has the ability + to lower its body temperature during cold winter nights, allowing it to + conserve energy. + questions_and_answers: + - question: | + Where do black-capped chickadees live? + answer: | + The black-capped chickadee (Poecile atricapillus) is a small, + nonmigratory, North American passerine bird that lives in deciduous + and mixed forests. + - question: | + What sound does a black-capped chickadee make? + answer: | + The black-capped chickadee is well known for its vocalizations, + including its fee-bee call and its chick-a-dee-dee-dee call, from + which it derives its name. + - question: | + What does a black-capped chickadee eat? 
+ answer: | + Black-capped chickadees feed primarily on insects and seeds, and are + known for their ability to cache food for use during the winter. + - context: | + The black-capped chickadee has a black cap and "bib" with white cheeks. + Its underparts are white with buff-colored flanks. Its back is + unstreaked and greenish gray, and the tail and wings are slate gray. It + has a black [beak](beak "wikilink") and legs, and dark brown + [irises](iris_(anatomy) "wikilink"). Males and females are generally + similar, although males have a larger bib. They can also be + distinguished based on a combination of weight and tail length. + [Tarsus](Bird_feet_and_legs "wikilink") length does not significantly + differentiate sexes. Males have a wing length of 63.5–67.5 mm (2.50–2.66 in), + a tail length of 58–63 mm (2.3–2.5 in), a bill length of 8–9.5 mm (0.31–0.37 in), + and a tarsus length of 16–17 mm (0.63–0.67 in). Females are on average slightly + smaller, with a wing length of 60.5–66.5 mm (2.38–2.62 in), a tail length of + 56.3–63 mm (2.22–2.48 in), a bill length of 9–9.5 mm (0.35–0.37 in), and a + tarsus length of 16–17 mm (0.63–0.67 in). Both sexes weigh 10–14 g (0.35–0.49 oz). + Juveniles are visually similar to adults but with fluffier plumage. + questions_and_answers: + - question: | + What color cheeks does the black capped chickadee have? + answer: | + The black-capped chickadee has a black cap and "bib" with white + cheeks. + - question: | + Do male black capped chickadees have a larger bib than the females? + answer: | + Yes, males and females are generally similar, although males have a + larger bib. + - question: | + How can you tell that a black capped chickadee is a juvenile? + answer: | + Juveniles are visually similar to adults but with fluffier plumage. + - context: | + One of the most recognizable sounds produced, particularly by the males, + is the two-note *fee-bee* song. 
It is a simple, clear whistle of two + notes, identical in rhythm, the first roughly a + [whole-step](Major_second "wikilink") above the second. The + frequency of their songs will typically start around + 400 [Hz](Hertz "wikilink"), and various tones spanning roughly 1 kHz + will be sung within the song. A decrease of roughly 200 Hz occurs when + the first note (*fee*) is sung, and then another decrease around 400 Hz + takes place between the end of *fee* and the beginning of *bee*. In + spite of these multiple changes in frequency, though, anybody listening + to the song only hears a pure, high-frequency tone. This is + distinguished from the Carolina chickadee's four-note call *fee-bee + fee-bay*; the lower notes are nearly identical but the higher *fee* + notes are omitted, making the black-capped song like *bee bay*. The + males sing the song only in relative isolation from other chickadees + (including their mates). In late summer, some young birds sing only a + single note. + questions_and_answers: + - question: | + What is the black-capped chickadee's fee-bee song? + answer: | + It is a simple, clear whistle of two notes, identical in rhythm, + the first roughly a whole-step above the second. + - question: | + What is the frequency of the black-capped chickadee's fee-bee + song? + answer: | + The frequency of their songs will typically start around 400 Hz, + and various tones spanning roughly 1 kHz will be sung within the + song. A decrease of roughly 200 Hz occurs when the first note (fee) + is sung, and then another decrease around 400 Hz takes place between + the end of fee and the beginning of bee. + - question: | + Do male black-capped chickadees sing their fee-bee song with other + chickadees around? + answer: | + No, the males sing the song only in relative isolation from other + chickadees (including their mates). + - context: | + Insects (especially caterpillars) form a large part of their diet in + summer. 
The birds hop along tree branches searching for food, sometimes + hanging upside down or hovering; they may make short flights to catch + insects in the air. Seeds and berries become more important in winter, + though insect eggs and pupae are eaten when available. Black-capped + chickadees have also been known to eat the fat off of dead mammals. + Sunflower seeds are readily taken from bird feeders. The birds take a + seed in their beak and commonly fly from the feeder to a tree, where + they proceed to hammer the seed on a branch to open it. + + Like many other species in the family Paridae, black-capped chickadees + commonly [cache](Hoarding_(animal_behavior) "wikilink") food, mostly + seeds, but sometimes insects, also. Items are stored singly in + various sites such as bark, dead leaves, clusters of conifer needles, or + knothole. Memory for the location of caches can last up to 28 days. + Within the first 24 hours, the birds can even remember the relative + quality of the stored items. + questions_and_answers: + - question: What do black-capped chickadees eat? + answer: | + Insects (especially caterpillars) form a large part of their diet + in summer. Seeds and berries become more important in winter, + though insect eggs and pupae are eaten when available. + - question: How do black-capped chickadees eat seeds? + answer: | + Black-capped chickadees take a seed in their beak and commonly + fly from the feeder to a tree, where they proceed to hammer the + seed on a branch to open it. + - question: | + How long can black-capped chickadees remember where they cached + their food? + answer: | + Memory for the location of caches can last up to 28 days. + - context: | + Black-capped chickadees are primarily subject to predation by [birds of + prey](birds_of_prey "wikilink"), including owls, hawks, and shrikes. + Nest-predation also occurs, primarily by raccoons, squirrels, opossums, + and snakes. 
Nest sites are also sometimes raided by [house + wrens](house_wren "wikilink"), who will destroy chickadee eggs in order + to reuse the location for their nest. + + Like many birds, black-capped chickadees are susceptible to [West Nile + Virus](West_Nile_Virus "wikilink"). They are also known to be affected + by blood parasites, including those that cause + [malaria](malaria "wikilink"), but particularly high rates of infection + have not been detected. Black-capped chickadees are also known to + be affected by [avian keratin + disorder](avian_keratin_disorder "wikilink"). + questions_and_answers: + - question: | + What birds of prey eat black-capped chickadees? + answer: | + Black-capped chickadees are primarily subject to predation by birds + of prey, including owls, hawks, and shrikes. + - question: | + What is a big danger to black-capped chickadee nests? + answer: | + Nest sites are also sometimes raided by house wrens, who will + destroy chickadee eggs in order to reuse the location for their nest. + - question: | + Do black-capped chickadees carry the West Nile virus? + answer: | + Like many birds, black-capped chickadees are susceptible to West + Nile Virus. +document_outline: | + Information about the black capped chickadee including distribution, appearance, + vocalization, diet, and predators. 
+document: + repo: https://github.com/juliadenham/Summit_knowledge + commit: 5f7158a5ce83c4ff493bfe341fe31ecad64ff697 + patterns: + - chickadee.md diff --git a/taxonomy/knowledge/technology/.gitignore b/taxonomy/knowledge/technology/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/taxonomy/scripts/check-yaml.py b/taxonomy/scripts/check-yaml.py new file mode 100755 index 000000000..16de656f9 --- /dev/null +++ b/taxonomy/scripts/check-yaml.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import argparse +import os +import pathlib +import sys + +# Third Party +from instructlab.schema.taxonomy import TaxonomyParser + + +class CheckYaml: + def __init__( + self, + *, + yaml_files: list[pathlib.Path], + taxonomy_folders: list[str] | None = None, + yamllint_config: str | None = None, + schema_version: int | None = None, + message_format: str | None = None, + ) -> None: + self.yaml_files = yaml_files + self.taxonomy_folders = taxonomy_folders + self.yamllint_config = yamllint_config + self.schema_version = schema_version + self.message_format = message_format + + def check(self) -> int: + exit_code: int = 0 + parser = TaxonomyParser( + taxonomy_folders=self.taxonomy_folders, + schema_version=self.schema_version, + message_format=self.message_format, + yamllint_config=self.yamllint_config, + ) + for file in self.yaml_files: + taxonomy = parser.parse(file) + if taxonomy.version > 1: + attribution_path = taxonomy.rel_path.with_name("attribution.txt") + if not attribution_path.is_file(): + taxonomy.error( + "The \"%s\" file does not exist or is not a file", + attribution_path.name, + ) + elif os.path.getsize(attribution_path) == 0: + taxonomy.error( + "The \"%s\" file must be non-empty", + taxonomy.path.with_name(attribution_path.name), + ) + if taxonomy.errors > 0: + exit_code = 1 + if not self.yaml_files: + print("No yaml files specified.") + return exit_code + + +def cli() -> int: + parser = 
argparse.ArgumentParser( + description=""" + Check Taxonomy YAML files for linting and schema validation. + """, + ) + parser.add_argument( + "-t", + "--taxonomy-folder", + action="append", + metavar="TAXONOMY_FOLDER", + dest="taxonomy_folders", + help=""" + A taxonomy folder. This argument can be specified multiple times. + Alternately, the TAXONOMY_FOLDERS environment variable can be used + to specify a space-separated list of folders. + """, + default=os.environ.get("TAXONOMY_FOLDERS"), + ) + parser.add_argument( + "-v", + "--schema-version", + help=""" + The version of the Taxonomy schema. + Alternately, the SCHEMA_VERSION environment variable can be used + to specify the version. + Specifying a version less than 1 will use the schema version + specified by each YAML document's "version" key. + If not specified, the highest schema version is used. + """, + default=os.environ.get("SCHEMA_VERSION"), + type=int, + ) + parser.add_argument( + "-l", + "--lint-config", + dest="yamllint_config", + help=""" + The yamllint configuration data. + Alternately, the YAMLLINT_CONFIG environment variable can be used + to specify the configuration data. 
+ """, + default=os.environ.get("YAMLLINT_CONFIG"), + ) + parser.add_argument( + "-f", + "--format", + help="The message format.", + dest="message_format", + choices=["standard", "github", "auto"], + default=None, + ) + parser.add_argument( + "yaml_file", + help="A qna.yaml file.", + nargs="*", + type=pathlib.Path, + ) + args = parser.parse_args() + + taxonomy_folders = args.taxonomy_folders + if isinstance(taxonomy_folders, str): + taxonomy_folders = taxonomy_folders.split() + check_yaml = CheckYaml( + yaml_files=args.yaml_file, + taxonomy_folders=taxonomy_folders, + yamllint_config=args.yamllint_config, + schema_version=args.schema_version, + message_format=args.message_format, + ) + exit_code = check_yaml.check() + return exit_code + + +if __name__ == "__main__": + sys.exit(cli()) diff --git a/taxonomy/scripts/requirements.txt b/taxonomy/scripts/requirements.txt new file mode 100644 index 000000000..3b6396ba4 --- /dev/null +++ b/taxonomy/scripts/requirements.txt @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 + +instructlab-schema>=0.4.0