diff --git a/.github/workflows/languages-publish.yml b/.github/workflows/languages-publish.yml index f1d0f7e8b..c93b672b8 100644 --- a/.github/workflows/languages-publish.yml +++ b/.github/workflows/languages-publish.yml @@ -25,7 +25,6 @@ jobs: cache: "pnpm" cache-dependency-path: | packages/languages/pnpm-lock.yaml - packages/doc-internal/pnpm-lock.yaml # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. REQUIRED registry-url: "https://registry.npmjs.org" - run: pnpm install @@ -36,8 +35,6 @@ jobs: BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") # Update package.json with the new version node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" - pnpm --filter doc-internal run fix-cdn-versions - git add ../.. git commit -m "🔖 @hugginface/languages $BUMPED_VERSION" git tag "languages-v$BUMPED_VERSION" - run: pnpm publish --no-git-checks . diff --git a/.github/workflows/tasks-publish.yml b/.github/workflows/tasks-publish.yml new file mode 100644 index 000000000..42319d514 --- /dev/null +++ b/.github/workflows/tasks-publish.yml @@ -0,0 +1,51 @@ +name: Tasks - Version and Release + +on: + workflow_dispatch: + inputs: + newversion: + description: "Semantic Version Bump Type (major minor patch)" + default: patch + +defaults: + run: + working-directory: packages/tasks + +jobs: + version_and_release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.BOT_ACCESS_TOKEN }} + - run: corepack enable + - uses: actions/setup-node@v3 + with: + node-version: "18" + cache: "pnpm" + cache-dependency-path: | + packages/tasks/pnpm-lock.yaml + # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. 
REQUIRED + registry-url: "https://registry.npmjs.org" + - run: pnpm install + - run: git config --global user.name machineuser + - run: git config --global user.email infra+machineuser@huggingface.co + - run: | + PACKAGE_VERSION=$(node -p "require('./package.json').version") + BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") + # Update package.json with the new version; nothing stages the file, so the commit below names it explicitly + node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" + git commit -m "🔖 @huggingface/tasks $BUMPED_VERSION" package.json + git tag "tasks-v$BUMPED_VERSION" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - run: git push --follow-tags + # hack - reuse actions/setup-node@v3 just to set a new registry + - uses: actions/setup-node@v3 + with: + node-version: "18" + registry-url: "https://npm.pkg.github.com" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/README.md b/README.md index f851ab7eb..755bf891b 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,6 @@ This is a collection of JS libraries to interact with the Hugging Face API, with - [@huggingface/inference](packages/inference/README.md): Use the Inference API to make calls to 100,000+ Machine Learning models, or your own [inference endpoints](https://hf.co/docs/inference-endpoints/)! - [@huggingface/agents](packages/agents/README.md): Interact with HF models through a natural language interface - [@huggingface/hub](packages/hub/README.md): Interact with huggingface.co to create or delete repos and commit / download files -- [@huggingface/languages](packages/languages/README.md): List of all languages supported by the huggingface hub With more to come, like `@huggingface/endpoints` to manage your HF Endpoints! 
diff --git a/packages/languages/package.json b/packages/languages/package.json index cec75ff1a..080d50d89 100644 --- a/packages/languages/package.json +++ b/packages/languages/package.json @@ -30,7 +30,7 @@ }, "files": [ "dist", - "index.ts", + "src", "tsconfig.json" ], "keywords": [ diff --git a/packages/tasks/.prettierignore b/packages/tasks/.prettierignore new file mode 100644 index 000000000..cac0c6949 --- /dev/null +++ b/packages/tasks/.prettierignore @@ -0,0 +1,4 @@ +pnpm-lock.yaml +# In order to avoid code samples to have tabs, they don't display well on npm +README.md +dist \ No newline at end of file diff --git a/packages/tasks/README.md b/packages/tasks/README.md new file mode 100644 index 000000000..67285ef19 --- /dev/null +++ b/packages/tasks/README.md @@ -0,0 +1,20 @@ +# Tasks + +This package contains data used for https://huggingface.co/tasks. + +## Philosophy behind Tasks + +The Task pages are made to lower the barrier of entry to understand a task that can be solved with machine learning and use or train a model to accomplish it. It's a collaborative documentation effort made to help out software developers, social scientists, or anyone with no background in machine learning that is interested in understanding how machine learning models can be used to solve a problem. + +The task pages avoid jargon to let everyone understand the documentation, and if specific terminology is needed, it is explained on the most basic level possible. This is important to understand before contributing to Tasks: at the end of every task page, the user is expected to be able to find and pull a model from the Hub and use it on their data and see if it works for their use case to come up with a proof of concept. + +## How to Contribute +You can open a pull request to contribute a new documentation about a new task. Under `src` we have a folder for every task that contains two files, `about.md` and `data.ts`. 
`about.md` contains the markdown part of the page, use cases, resources and a minimal code block to infer a model that belongs to the task. `data.ts` contains redirections to canonical models and datasets, metrics, the schema of the task and the information the inference widget needs. + +![Anatomy of a Task Page](https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/contribution-guide/anatomy.png) + +We have a [`dataset`](https://huggingface.co/datasets/huggingfacejs/tasks) that contains data used in the inference widget. The last file is `const.ts`, which has the task-to-library mapping (e.g. spacy to token-classification) where you can add a library. The libraries will appear in the top right corner, as shown below. + +![Libraries of a Task](https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/contribution-guide/libraries.png) + +This might seem overwhelming, but you don't necessarily need to add all of these in one pull request or on your own; you can simply contribute one section. Feel free to ask for help whenever you need it. 
\ No newline at end of file diff --git a/packages/tasks/package.json b/packages/tasks/package.json new file mode 100644 index 000000000..8f4a36cf3 --- /dev/null +++ b/packages/tasks/package.json @@ -0,0 +1,45 @@ +{ + "name": "@huggingface/tasks", + "packageManager": "pnpm@8.3.1", + "version": "0.0.3", + "description": "List of ML tasks for huggingface.co/tasks", + "repository": "https://github.com/huggingface/huggingface.js.git", + "publishConfig": { + "access": "public" + }, + "main": "./dist/index.js", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "require": "./dist/index.js", + "import": "./dist/index.mjs" + } + }, + "source": "src/index.ts", + "scripts": { + "lint": "eslint --quiet --fix --ext .cjs,.ts .", + "lint:check": "eslint --ext .cjs,.ts .", + "format": "prettier --write .", + "format:check": "prettier --check .", + "prepublishOnly": "pnpm run build", + "build": "tsup src/index.ts --format cjs,esm --clean --dts", + "type-check": "tsc" + }, + "files": [ + "dist", + "src", + "tsconfig.json" + ], + "keywords": [ + "huggingface", + "hub", + "tasks" + ], + "author": "Hugging Face", + "license": "MIT", + "devDependencies": { + "typescript": "^5.0.4" + } +} diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml new file mode 100644 index 000000000..a3ed38c89 --- /dev/null +++ b/packages/tasks/pnpm-lock.yaml @@ -0,0 +1,14 @@ +lockfileVersion: '6.0' + +devDependencies: + typescript: + specifier: ^5.0.4 + version: 5.0.4 + +packages: + + /typescript@5.0.4: + resolution: {integrity: sha512-cW9T5W9xY37cc+jfEnaUvX91foxtHkza3Nw3wkoF4sSlKn0MONdkdEndig/qPBWXNkmplh3NzayQzCiHM4/hqw==} + engines: {node: '>=12.20'} + hasBin: true + dev: true diff --git a/packages/tasks/src/Types.ts b/packages/tasks/src/Types.ts new file mode 100644 index 000000000..0824893f1 --- /dev/null +++ b/packages/tasks/src/Types.ts @@ -0,0 +1,65 @@ +import type { ModelLibraryKey } from 
"./modelLibraries"; +import type { PipelineType } from "./pipelines"; + +export interface ExampleRepo { + description: string; + id: string; +} + +export type TaskDemoEntry = + | { + filename: string; + type: "audio"; + } + | { + data: Array<{ + label: string; + score: number; + }>; + type: "chart"; + } + | { + filename: string; + type: "img"; + } + | { + table: string[][]; + type: "tabular"; + } + | { + content: string; + label: string; + type: "text"; + } + | { + text: string; + tokens: Array<{ + end: number; + start: number; + type: string; + }>; + type: "text-with-tokens"; + }; + +export interface TaskDemo { + inputs: TaskDemoEntry[]; + outputs: TaskDemoEntry[]; +} + +export interface TaskData { + datasets: ExampleRepo[]; + demo: TaskDemo; + id: PipelineType; + isPlaceholder?: boolean; + label: string; + libraries: ModelLibraryKey[]; + metrics: ExampleRepo[]; + models: ExampleRepo[]; + spaces: ExampleRepo[]; + summary: string; + widgetModels: string[]; + youtubeId?: string; +} + +/** Shape of the object each task's `data.ts` exports: `TaskData` without `id`, `label` and `libraries`. */ +export type TaskDataCustom = Omit<TaskData, "id" | "label" | "libraries">; diff --git a/packages/tasks/src/audio-classification/about.md b/packages/tasks/src/audio-classification/about.md new file mode 100644 index 000000000..9b1d7c6e9 --- /dev/null +++ b/packages/tasks/src/audio-classification/about.md @@ -0,0 +1,85 @@ +## Use Cases + +### Command Recognition + +Command recognition or keyword spotting classifies utterances into a predefined set of commands. This is often done on-device for fast response time. + +As an example, using the Google Speech Commands dataset, given an input, a model can classify which of the following commands the user is saying: + +``` +'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence' +``` + +Speechbrain models can easily perform this task with just a couple of lines of code! 
+ +```python +from speechbrain.pretrained import EncoderClassifier +model = EncoderClassifier.from_hparams( + "speechbrain/google_speech_command_xvector" +) +model.classify_file("file.wav") +``` + +### Language Identification + +Datasets such as VoxLingua107 allow anyone to train language identification models for up to 107 languages! This can be extremely useful as a preprocessing step for other systems. Here's an example [model](https://huggingface.co/TalTechNLP/voxlingua107-epaca-tdnn) trained on VoxLingua107. + +### Emotion Recognition + +Emotion recognition is self-explanatory. In addition to trying the widgets, you can use the Inference API to perform audio classification. Here is a simple example that uses a [HuBERT](https://huggingface.co/superb/hubert-large-superb-er) model fine-tuned for this task. + +```python +import json +import requests + +headers = {"Authorization": f"Bearer {API_TOKEN}"} +API_URL = "https://api-inference.huggingface.co/models/superb/hubert-large-superb-er" + +def query(filename): + with open(filename, "rb") as f: + data = f.read() + response = requests.request("POST", API_URL, headers=headers, data=data) + return json.loads(response.content.decode("utf-8")) + +data = query("sample1.flac") +# [{'label': 'neu', 'score': 0.60}, +# {'label': 'hap', 'score': 0.20}, +# {'label': 'ang', 'score': 0.13}, +# {'label': 'sad', 'score': 0.07}] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio classification models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.audioClassification({ + data: await (await fetch("sample.flac")).blob(), + model: "facebook/mms-lid-126", +}); +``` + +### Speaker Identification + +Speaker Identification is classifying the audio of the person speaking. Speakers are usually predefined. 
You can try out this task with [this model](https://huggingface.co/superb/wav2vec2-base-superb-sid). A useful dataset for this task is VoxCeleb1. + +## Solving audio classification for your own data + +We have some great news! You can do fine-tuning (transfer learning) to train a well-performing model without requiring as much data. Pretrained models such as Wav2Vec2 and HuBERT exist. [Facebook's Wav2Vec2 XLS-R model](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) is a large multilingual model trained on 128 languages and with 436K hours of speech. + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful! + +### Notebooks + +- [PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification) + +### Documentation + +- [Audio classification task guide](https://huggingface.co/docs/transformers/tasks/audio_classification) diff --git a/packages/tasks/src/audio-classification/data.ts b/packages/tasks/src/audio-classification/data.ts new file mode 100644 index 000000000..92e879c5c --- /dev/null +++ b/packages/tasks/src/audio-classification/data.ts @@ -0,0 +1,77 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A benchmark of 10 different audio tasks.", + id: "superb", + }, + ], + demo: { + inputs: [ + { + filename: "audio.wav", + type: "audio", + }, + ], + outputs: [ + { + data: [ + { + label: "Up", + score: 0.2, + }, + { + label: "Down", + score: 0.8, + }, + ], + type: "chart", + }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: "", + id: "f1", + }, 
+ ], + models: [ + { + description: "An easy-to-use model for Command Recognition.", + id: "speechbrain/google_speech_command_xvector", + }, + { + description: "An Emotion Recognition model.", + id: "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition", + }, + { + description: "A language identification model.", + id: "facebook/mms-lid-126", + }, + ], + spaces: [ + { + description: "An application that can predict the language spoken in a given audio.", + id: "akhaliq/Speechbrain-audio-classification", + }, + ], + summary: + "Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.", + widgetModels: ["facebook/mms-lid-126"], + youtubeId: "KWwzcmG98Ds", +}; + +export default taskData; diff --git a/packages/tasks/src/audio-to-audio/about.md b/packages/tasks/src/audio-to-audio/about.md new file mode 100644 index 000000000..e56275277 --- /dev/null +++ b/packages/tasks/src/audio-to-audio/about.md @@ -0,0 +1,56 @@ +## Use Cases + +### Speech Enhancement (Noise removal) + +Speech Enhancement is a bit self explanatory. It improves (or enhances) the quality of an audio by removing noise. There are multiple libraries to solve this task, such as Speechbrain, Asteroid and ESPNet. 
Here is a simple example using Speechbrain: + +```python +from speechbrain.pretrained import SpectralMaskEnhancement +model = SpectralMaskEnhancement.from_hparams( + "speechbrain/mtl-mimic-voicebank" +) +model.enhance_file("file.wav") +``` + +Alternatively, you can use the [Inference API](https://huggingface.co/inference-api) to solve this task: + +```python +import json +import requests + +headers = {"Authorization": f"Bearer {API_TOKEN}"} +API_URL = "https://api-inference.huggingface.co/models/speechbrain/mtl-mimic-voicebank" + +def query(filename): + with open(filename, "rb") as f: + data = f.read() + response = requests.request("POST", API_URL, headers=headers, data=data) + return json.loads(response.content.decode("utf-8")) + +data = query("sample1.flac") +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio-to-audio models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.audioToAudio({ + data: await (await fetch("sample.flac")).blob(), + model: "speechbrain/sepformer-wham", +}); +``` + +### Audio Source Separation + +Audio Source Separation allows you to isolate different sounds from individual sources. For example, if you have an audio file with multiple people speaking, you can get an audio file for each of them. You can then use an Automatic Speech Recognition system to extract the text from each of these sources as an initial step for your system! + +Audio-to-Audio can also be used to remove noise from audio files: you get one audio for the person speaking and another audio for the noise. This can also be useful when you have multi-person audio with some noise: you can get one audio for each person and then one audio for the noise. 
+ +## Training a model for your own data + +If you want to learn how to train models for the Audio-to-Audio task, we recommend the following tutorials: + +- [Speech Enhancement](https://speechbrain.github.io/tutorial_enhancement.html) +- [Source Separation](https://speechbrain.github.io/tutorial_separation.html) diff --git a/packages/tasks/src/audio-to-audio/data.ts b/packages/tasks/src/audio-to-audio/data.ts new file mode 100644 index 000000000..56f03188e --- /dev/null +++ b/packages/tasks/src/audio-to-audio/data.ts @@ -0,0 +1,66 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "512-element X-vector embeddings of speakers from CMU ARCTIC dataset.", + id: "Matthijs/cmu-arctic-xvectors", + }, + ], + demo: { + inputs: [ + { + filename: "input.wav", + type: "audio", + }, + ], + outputs: [ + { + filename: "label-0.wav", + type: "audio", + }, + { + filename: "label-1.wav", + type: "audio", + }, + ], + }, + metrics: [ + { + description: + "The Signal-to-Noise ratio is the relationship between the target signal level and the background noise level. 
It is calculated as the logarithm of the target signal divided by the background noise, in decibels.", + id: "snri", + }, + { + description: + "The Signal-to-Distortion ratio is the relationship between the target signal and the sum of noise, interference, and artifact errors", + id: "sdri", + }, + ], + models: [ + { + description: "A solid model of audio source separation.", + id: "speechbrain/sepformer-wham", + }, + { + description: "A speech enhancement model.", + id: "speechbrain/metricgan-plus-voicebank", + }, + ], + spaces: [ + { + description: "An application for speech separation.", + id: "younver/speechbrain-speech-separation", + }, + { + description: "An application for audio style transfer.", + id: "nakas/audio-diffusion_style_transfer", + }, + ], + summary: + "Audio-to-Audio is a family of tasks in which the input is an audio and the output is one or multiple generated audios. Some example tasks are speech enhancement and source separation.", + widgetModels: ["speechbrain/sepformer-wham"], + youtubeId: "iohj7nCCYoM", +}; + +export default taskData; diff --git a/packages/tasks/src/automatic-speech-recognition/about.md b/packages/tasks/src/automatic-speech-recognition/about.md new file mode 100644 index 000000000..3871cba1c --- /dev/null +++ b/packages/tasks/src/automatic-speech-recognition/about.md @@ -0,0 +1,87 @@ +## Use Cases + +### Virtual Speech Assistants + +Many edge devices have an embedded virtual assistant to interact with the end users better. These assistants rely on ASR models to recognize different voice commands to perform various tasks. For instance, you can ask your phone to dial a phone number, ask a general question, or schedule a meeting. + +### Caption Generation + +A caption generation model takes audio as input from sources to generate automatic captions through transcription, for live-streamed or recorded videos. This can help with content accessibility. 
For example, an audience watching a video that includes a non-native language can rely on captions to interpret the content. It can also help with information retention in online-class environments, improving knowledge assimilation while reading and taking notes faster. + +## Task Variants + +### Multilingual ASR + +Multilingual ASR models can convert audio inputs with multiple languages into transcripts. Some multilingual ASR models include [language identification](https://huggingface.co/tasks/audio-classification) blocks to improve the performance. + +The use of Multilingual ASR has become popular: maintaining just a single model for all languages can simplify the production pipeline. Take a look at [Whisper](https://huggingface.co/openai/whisper-large-v2) to get an idea of how 100+ languages can be processed by a single model. + +## Inference + +The Hub contains [~9,000 ASR models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) that you can use right away by trying out the widgets directly in the browser or calling the models as a service using the Inference API. 
Here is a simple code snippet to do exactly this: + +```python +import json +import requests + +headers = {"Authorization": f"Bearer {API_TOKEN}"} +API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2" + +def query(filename): + with open(filename, "rb") as f: + data = f.read() + response = requests.request("POST", API_URL, headers=headers, data=data) + return json.loads(response.content.decode("utf-8")) + +data = query("sample1.flac") +``` + +You can also use libraries such as [transformers](https://huggingface.co/models?library=transformers&pipeline_tag=automatic-speech-recognition&sort=downloads), [speechbrain](https://huggingface.co/models?library=speechbrain&pipeline_tag=automatic-speech-recognition&sort=downloads), [NeMo](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=nemo&sort=downloads) and [espnet](https://huggingface.co/models?library=espnet&pipeline_tag=automatic-speech-recognition&sort=downloads) if you want one-click managed Inference without any hassle. + +```python +from transformers import pipeline + +with open("sample.flac", "rb") as f: + data = f.read() + +pipe = pipeline("automatic-speech-recognition", "openai/whisper-large-v2") +pipe("sample.flac") +# {'text': "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOL ROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"} +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to transcribe text with javascript using models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.automaticSpeechRecognition({ + data: await (await fetch("sample.flac")).blob(), + model: "openai/whisper-large-v2", +}); +``` + +## Solving ASR for your own data + +We have some great news! 
You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-v2) is a large multilingual model trained on 100+ languages and with 680K hours of speech. + +The following detailed [blog post](https://huggingface.co/blog/fine-tune-whisper) shows how to fine-tune a pre-trained Whisper checkpoint on labeled data for ASR. With the right data and strategy you can fine-tune a high-performing model on a free Google Colab instance too. We suggest reading the blog post for more info! + +## Hugging Face Whisper Event + +In December 2022, over 450 participants collaborated, fine-tuned and shared 600+ ASR Whisper models in 100+ different languages. You can compare these models on the event's speech recognition [leaderboard](https://huggingface.co/spaces/whisper-event/leaderboard?dataset=mozilla-foundation%2Fcommon_voice_11_0&config=ar&split=test). + +These events help democratize ASR for all languages, including low-resource languages. In addition to the trained models, the [event](https://github.com/huggingface/community-events/tree/main/whisper-fine-tuning-event) helps to build practical collaborative knowledge. 
+ +## Useful Resources + +- [Fine-tuning MetaAI's MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters) +- [Making automatic speech recognition work on large files with Wav2Vec2 in 🤗 Transformers](https://huggingface.co/blog/asr-chunking) +- [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram) +- [ML for Audio Study Group - Intro to Audio and ASR Deep Dive](https://www.youtube.com/watch?v=D-MH6YjuIlE) +- [Massively Multilingual ASR: 50 Languages, 1 Model, 1 Billion Parameters](https://arxiv.org/pdf/2007.03001.pdf) +- An ASR toolkit made by [NVIDIA: NeMo](https://github.com/NVIDIA/NeMo) with code and pretrained models useful for new ASR models. Watch the [introductory video](https://www.youtube.com/embed/wBgpMf_KQVw) for an overview. +- [An introduction to SpeechT5, a multi-purpose speech recognition and synthesis model](https://huggingface.co/blog/speecht5) +- [A guide on Fine-tuning Whisper For Multilingual ASR with 🤗Transformers](https://huggingface.co/blog/fine-tune-whisper) +- [Automatic speech recognition task guide](https://huggingface.co/docs/transformers/tasks/asr) +- [Speech Synthesis, Recognition, and More With SpeechT5](https://huggingface.co/blog/speecht5) diff --git a/packages/tasks/src/automatic-speech-recognition/data.ts b/packages/tasks/src/automatic-speech-recognition/data.ts new file mode 100644 index 000000000..05d13e14c --- /dev/null +++ b/packages/tasks/src/automatic-speech-recognition/data.ts @@ -0,0 +1,78 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "18,000 hours of multilingual audio-text dataset in 108 languages.", + id: "mozilla-foundation/common_voice_13_0", + }, + { + description: "An English dataset with 1,000 hours of data.", + id: "librispeech_asr", + }, + { + description: "High quality, multi-speaker audio data and their transcriptions in various languages.", + id: "openslr", + }, 
+ ], + demo: { + inputs: [ + { + filename: "input.flac", + type: "audio", + }, + ], + outputs: [ + { + /// GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES I + label: "Transcript", + content: "Going along slushy country roads and speaking to damp audiences in...", + type: "text", + }, + ], + }, + metrics: [ + { + description: "", + id: "wer", + }, + { + description: "", + id: "cer", + }, + ], + models: [ + { + description: "A powerful ASR model by OpenAI.", + id: "openai/whisper-large-v2", + }, + { + description: "A good generic ASR model by MetaAI.", + id: "facebook/wav2vec2-base-960h", + }, + { + description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.", + id: "facebook/s2t-small-mustc-en-fr-st", + }, + ], + spaces: [ + { + description: "A powerful general-purpose speech recognition application.", + id: "openai/whisper", + }, + { + description: "Fastest speech recognition application.", + id: "sanchit-gandhi/whisper-jax", + }, + { + description: "An application that transcribes speeches in YouTube videos.", + id: "jeffistyping/Youtube-Whisperer", + }, + ], + summary: + "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. 
It has many applications, such as voice user interfaces.", + widgetModels: ["openai/whisper-large-v2"], + youtubeId: "TksaY_FDgnk", +}; + +export default taskData; diff --git a/packages/tasks/src/const.ts b/packages/tasks/src/const.ts new file mode 100644 index 000000000..34fb9b24a --- /dev/null +++ b/packages/tasks/src/const.ts @@ -0,0 +1,59 @@ +import type { ModelLibraryKey } from "./modelLibraries"; +import type { PipelineType } from "./pipelines"; + +/** + * Model libraries compatible with each ML task + */ +export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = { + "audio-classification": ["speechbrain", "transformers"], + "audio-to-audio": ["asteroid", "speechbrain"], + "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"], + conversational: ["transformers"], + "depth-estimation": ["transformers"], + "document-question-answering": ["transformers"], + "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"], + "fill-mask": ["transformers", "transformers.js"], + "graph-ml": ["transformers"], + "image-classification": ["keras", "timm", "transformers", "transformers.js"], + "image-segmentation": ["transformers", "transformers.js"], + "image-to-image": [], + "image-to-text": ["transformers.js"], + "video-classification": [], + "multiple-choice": ["transformers"], + "object-detection": ["transformers", "transformers.js"], + other: [], + "question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"], + robotics: [], + "reinforcement-learning": ["transformers", "stable-baselines3", "ml-agents", "sample-factory"], + "sentence-similarity": ["sentence-transformers", "spacy", "transformers.js"], + summarization: ["transformers", "transformers.js"], + "table-question-answering": ["transformers"], + "table-to-text": ["transformers"], + "tabular-classification": ["sklearn"], + "tabular-regression": ["sklearn"], + "tabular-to-text": ["transformers"], + "text-classification": 
["adapter-transformers", "spacy", "transformers", "transformers.js"], + "text-generation": ["transformers", "transformers.js"], + "text-retrieval": [], + "text-to-image": [], + "text-to-speech": ["espnet", "tensorflowtts", "transformers"], + "text-to-audio": ["transformers"], + "text-to-video": [], + "text2text-generation": ["transformers", "transformers.js"], + "time-series-forecasting": [], + "token-classification": [ + "adapter-transformers", + "flair", + "spacy", + "span-marker", + "stanza", + "transformers", + "transformers.js", + ], + translation: ["transformers", "transformers.js"], + "unconditional-image-generation": [], + "visual-question-answering": [], + "voice-activity-detection": [], + "zero-shot-classification": ["transformers", "transformers.js"], + "zero-shot-image-classification": ["transformers.js"], +}; diff --git a/packages/tasks/src/conversational/about.md b/packages/tasks/src/conversational/about.md new file mode 100644 index 000000000..d2141ba20 --- /dev/null +++ b/packages/tasks/src/conversational/about.md @@ -0,0 +1,50 @@ +## Use Cases + +### Chatbot 💬 + +Chatbots are used to have conversations instead of providing direct contact with a live human. They are used to provide customer service, sales, and can even be used to play games (see [ELIZA](https://en.wikipedia.org/wiki/ELIZA) from 1966 for one of the earliest examples). + +### Voice Assistants 🎙️ + +Conversational response models are used as part of voice assistants to provide appropriate responses to voice-based queries. + +## Inference + +You can infer with Conversational models with the 🤗 Transformers library using the `conversational` pipeline. This pipeline takes a conversation prompt or a list of conversations and generates responses for each prompt. The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task (see https://huggingface.co/models?filter=conversational for a list of updated Conversational models). 
+ +```python +from transformers import pipeline, Conversation +converse = pipeline("conversational") + +conversation_1 = Conversation("Going to the movies tonight - any suggestions?") +conversation_2 = Conversation("What's the last book you have read?") +converse([conversation_1, conversation_2]) + +## Output: +## Conversation 1 +## user >> Going to the movies tonight - any suggestions? +## bot >> The Big Lebowski , +## Conversation 2 +## user >> What's the last book you have read? +## bot >> The Last Question +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with conversational models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.conversational({ + model: "facebook/blenderbot-400M-distill", + inputs: "Going to the movies tonight - any suggestions?", +}); +``` + +## Useful Resources + +- Learn how ChatGPT and InstructGPT work in this blog: [Illustrating Reinforcement Learning from Human Feedback (RLHF)](https://huggingface.co/blog/rlhf) +- [Reinforcement Learning from Human Feedback From Zero to ChatGPT](https://www.youtube.com/watch?v=EAd4oQtEJOM) +- [A guide on Dialog Agents](https://huggingface.co/blog/dialog-agents) + +This page was made possible thanks to the efforts of [Viraat Aryabumi](https://huggingface.co/viraat). 
diff --git a/packages/tasks/src/conversational/data.ts b/packages/tasks/src/conversational/data.ts new file mode 100644 index 000000000..85c405761 --- /dev/null +++ b/packages/tasks/src/conversational/data.ts @@ -0,0 +1,66 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: + "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.", + id: "blended_skill_talk", + }, + { + description: + "ConvAI is a dataset of human-to-bot conversations labeled for quality. This data can be used to train a metric for evaluating dialogue systems", + id: "conv_ai_2", + }, + { + description: "EmpatheticDialogues is a dataset of 25k conversations grounded in emotional situations", + id: "empathetic_dialogues", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "Hey my name is Julien! How are you?", + type: "text", + }, + ], + outputs: [ + { + label: "Answer", + content: "Hi Julien! My name is Julia! I am well.", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "BLEU score is calculated by counting the number of shared single or subsequent tokens between the generated sequence and the reference. Subsequent n tokens are called “n-grams”. Unigram refers to a single token while bi-gram refers to token pairs and n-grams refer to n subsequent tokens.
The score ranges from 0 to 1, where 1 means the translation perfectly matched and 0 did not match at all", + id: "bleu", + }, + ], + models: [ + { + description: "A distilled, faster and smaller version of the BlenderBot conversational model.", + id: "facebook/blenderbot-400M-distill", + }, + { + description: + "DialoGPT is a large-scale pretrained dialogue response generation model for multiturn conversations.", + id: "microsoft/DialoGPT-large", + }, + ], + spaces: [ + { + description: "A chatbot based on Blender model.", + id: "EXFINITE/BlenderBot-UI", + }, + ], + summary: + "Conversational response modeling is the task of generating conversational text that is relevant, coherent and knowledgeable given a prompt. These models have applications in chatbots, and as a part of voice assistants", + widgetModels: ["facebook/blenderbot-400M-distill"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/depth-estimation/about.md b/packages/tasks/src/depth-estimation/about.md new file mode 100644 index 000000000..b83d60e24 --- /dev/null +++ b/packages/tasks/src/depth-estimation/about.md @@ -0,0 +1,36 @@ +## Use Cases +Depth estimation models can be used to estimate the depth of different objects present in an image. + +### Estimation of Volumetric Information +Depth estimation models are widely used to study volumetric formation of objects present inside an image. This is an important use case in the domain of computer graphics. + +### 3D Representation + +Depth estimation models can also be used to develop a 3D representation from a 2D image. + +## Inference + +With the `transformers` library, you can use the `depth-estimation` pipeline to infer with depth estimation models. You can initialize the pipeline with a model id from the Hub. If you do not provide a model id it will initialize with [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) by default. When calling the pipeline you just need to specify a path, http link or an image loaded in PIL.
Additionally, you can find a comprehensive list of various depth estimation models at [this link](https://huggingface.co/models?pipeline_tag=depth-estimation). + +```python +from transformers import pipeline + +estimator = pipeline(task="depth-estimation", model="Intel/dpt-large") +result = estimator(images="http://images.cocodataset.org/val2017/000000039769.jpg") +result + +# {'predicted_depth': tensor([[[ 6.3199, 6.3629, 6.4148, ..., 10.4104, 10.5109, 10.3847], +# [ 6.3850, 6.3615, 6.4166, ..., 10.4540, 10.4384, 10.4554], +# [ 6.3519, 6.3176, 6.3575, ..., 10.4247, 10.4618, 10.4257], +# ..., +# [22.3772, 22.4624, 22.4227, ..., 22.5207, 22.5593, 22.5293], +# [22.5073, 22.5148, 22.5114, ..., 22.6604, 22.6344, 22.5871], +# [22.5176, 22.5275, 22.5218, ..., 22.6282, 22.6216, 22.6108]]]), +# 'depth': <PIL.Image.Image image mode=L size=640x480>} + +# You can visualize the result just by calling `result["depth"]`. +``` + +## Useful Resources + +- [Monocular depth estimation task guide](https://huggingface.co/docs/transformers/tasks/monocular_depth_estimation) diff --git a/packages/tasks/src/depth-estimation/data.ts b/packages/tasks/src/depth-estimation/data.ts new file mode 100644 index 000000000..1a9b0d2a1 --- /dev/null +++ b/packages/tasks/src/depth-estimation/data.ts @@ -0,0 +1,52 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "NYU Depth V2 Dataset: Video dataset containing both RGB and depth sensor data", + id: "sayakpaul/nyu_depth_v2", + }, + ], + demo: { + inputs: [ + { + filename: "depth-estimation-input.jpg", + type: "img", + }, + ], + outputs: [ + { + filename: "depth-estimation-output.png", + type: "img", + }, + ], + }, + metrics: [], + models: [ + { + // TO DO: write description + description: "Strong Depth Estimation model trained on 1.4 million images.", + id: "Intel/dpt-large", + }, + { + // TO DO: write description + description: "Strong Depth Estimation model trained on the KITTI dataset.", + id: "vinvino02/glpn-kitti", +
}, + ], + spaces: [ + { + description: "An application that predicts the depth of an image and then reconstructs the 3D model as voxels.", + id: "radames/dpt-depth-estimation-3d-voxels", + }, + { + description: "An application that can estimate the depth in a given image.", + id: "keras-io/Monocular-Depth-Estimation", + }, + ], + summary: "Depth estimation is the task of predicting depth of the objects present in an image.", + widgetModels: [""], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/document-question-answering/about.md b/packages/tasks/src/document-question-answering/about.md new file mode 100644 index 000000000..528c29ec9 --- /dev/null +++ b/packages/tasks/src/document-question-answering/about.md @@ -0,0 +1,53 @@ +## Use Cases + +Document Question Answering models can be used to answer natural language questions about documents. Typically, document QA models consider textual, layout and potentially visual information. This is useful when the question requires some understanding of the visual aspects of the document. +Nevertheless, certain document QA models can work without document images. Hence the task is not limited to visually-rich documents and allows users to ask questions based on spreadsheets, text PDFs, etc! + +### Document Parsing + +One of the most popular use cases of document question answering models is the parsing of structured documents. For example, you can extract the name, address, and other information from a form. You can also use the model to extract information from a table, or even a resume. + +### Invoice Information Extraction + +Another very popular use case is invoice information extraction. For example, you can extract the invoice number, the invoice date, the total amount, the VAT number, and the invoice recipient.
+ +## Inference + +You can infer with Document QA models with the 🤗 Transformers library using the [`document-question-answering` pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline). If no model checkpoint is given, the pipeline will be initialized with [`impira/layoutlm-document-qa`](https://huggingface.co/impira/layoutlm-document-qa). This pipeline takes question(s) and document(s) as input, and returns the answer. +👉 Note that the question answering task solved here is extractive: the model extracts the answer from a context (the document). + +```python +from transformers import pipeline +from PIL import Image + +pipe = pipeline("document-question-answering", model="naver-clova-ix/donut-base-finetuned-docvqa") + +question = "What is the purchase amount?" +image = Image.open("your-document.png") + +pipe(image=image, question=question) + +## [{'answer': '20,000$'}] +``` + +## Useful Resources + +Would you like to learn more about Document QA? Awesome! Here are some curated resources that you may find helpful! 
+ +- [Document Visual Question Answering (DocVQA) challenge](https://rrc.cvc.uab.es/?ch=17) +- [DocVQA: A Dataset for Document Visual Question Answering](https://arxiv.org/abs/2007.00398) (Dataset paper) +- [ICDAR 2021 Competition on Document Visual Question Answering](https://lilianweng.github.io/lil-log/2020/10/29/open-domain-question-answering.html) (Conference paper) +- [HuggingFace's Document Question Answering pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline) +- [Github repo: DocQuery - Document Query Engine Powered by Large Language Models](https://github.com/impira/docquery) + +### Notebooks + +- [Fine-tuning Donut on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/0ea77f29d01217587d7e32a848f3691d9c15d6ab/Donut/DocVQA) +- [Fine-tuning LayoutLMv2 on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/1b4bad710c41017d07a8f63b46a12523bfd2e835/LayoutLMv2/DocVQA) +- [Accelerating Document AI](https://huggingface.co/blog/document-ai) + +### Documentation + +- [Document question answering task guide](https://huggingface.co/docs/transformers/tasks/document_question_answering) + +The contents of this page are contributed by [Eliott Zemour](https://huggingface.co/eliolio) and reviewed by [Kwadwo Agyapon-Ntra](https://huggingface.co/KayO) and [Ankur Goyal](https://huggingface.co/ankrgyl). diff --git a/packages/tasks/src/document-question-answering/data.ts b/packages/tasks/src/document-question-answering/data.ts new file mode 100644 index 000000000..275173fa8 --- /dev/null +++ b/packages/tasks/src/document-question-answering/data.ts @@ -0,0 +1,70 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: + "Dataset from the 2020 DocVQA challenge. 
The documents are taken from the UCSF Industry Documents Library.", + id: "eliolio/docvqa", + }, + ], + demo: { + inputs: [ + { + label: "Question", + content: "What is the idea behind the consumer relations efficiency team?", + type: "text", + }, + { + filename: "document-question-answering-input.png", + type: "img", + }, + ], + outputs: [ + { + label: "Answer", + content: "Balance cost efficiency with quality customer service", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "The evaluation metric for the DocVQA challenge is the Average Normalized Levenshtein Similarity (ANLS). This metric is flexible to character recognition errors and compares the predicted answer with the ground truth answer.", + id: "anls", + }, + { + description: + "Exact Match is a metric based on the strict character match of the predicted answer and the right answer. For answers predicted correctly, the Exact Match will be 1. Even if only one character is different, Exact Match will be 0", + id: "exact-match", + }, + ], + models: [ + { + description: "A LayoutLM model for the document QA task, fine-tuned on DocVQA and SQuAD2.0.", + id: "impira/layoutlm-document-qa", + }, + { + description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.", + id: "naver-clova-ix/donut-base-finetuned-docvqa", + }, + ], + spaces: [ + { + description: "A robust document question answering application.", + id: "impira/docquery", + }, + { + description: "An application that can answer questions from invoices.", + id: "impira/invoices", + }, + ], + summary: + "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language.
Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.", + widgetModels: ["impira/layoutlm-document-qa"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/feature-extraction/about.md b/packages/tasks/src/feature-extraction/about.md new file mode 100644 index 000000000..60c7c7ed3 --- /dev/null +++ b/packages/tasks/src/feature-extraction/about.md @@ -0,0 +1,34 @@ +## About the Task + +Feature extraction is the task of building features intended to be informative from a given dataset, +facilitating the subsequent learning and generalization steps in various domains of machine learning. + +## Use Cases + +Feature extraction can be used to do transfer learning in natural language processing, computer vision and audio models. + +## Inference + +#### Feature Extraction + +```python +from transformers import pipeline +checkpoint = "facebook/bart-base" +feature_extractor = pipeline("feature-extraction",framework="pt",model=checkpoint) +text = "Transformers is an awesome library!" 
+ +#Reducing along the first dimension to get a 768 dimensional array +feature_extractor(text,return_tensors = "pt")[0].numpy().mean(axis=0) + +'''tensor([[[ 2.5834, 2.7571, 0.9024, ..., 1.5036, -0.0435, -0.8603], + [-1.2850, -1.0094, -2.0826, ..., 1.5993, -0.9017, 0.6426], + [ 0.9082, 0.3896, -0.6843, ..., 0.7061, 0.6517, 1.0550], + ..., + [ 0.6919, -1.1946, 0.2438, ..., 1.3646, -1.8661, -0.1642], + [-0.1701, -2.0019, -0.4223, ..., 0.3680, -1.9704, -0.0068], + [ 0.2520, -0.6869, -1.0582, ..., 0.5198, -2.2106, 0.4547]]])''' +``` + +## Useful resources + +- [Documentation for feature extractor of 🤗Transformers](https://huggingface.co/docs/transformers/main_classes/feature_extractor) diff --git a/packages/tasks/src/feature-extraction/data.ts b/packages/tasks/src/feature-extraction/data.ts new file mode 100644 index 000000000..fe5f1785b --- /dev/null +++ b/packages/tasks/src/feature-extraction/data.ts @@ -0,0 +1,54 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: + "Wikipedia dataset containing cleaned articles of all languages. 
Can be used to train `feature-extraction` models.", + id: "wikipedia", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "India, officially the Republic of India, is a country in South Asia.", + type: "text", + }, + ], + outputs: [ + { + table: [ + ["Dimension 1", "Dimension 2", "Dimension 3"], + ["2.583383083343506", "2.757075071334839", "0.9023529887199402"], + ["8.29393482208252", "1.1071064472198486", "2.03399395942688"], + ["-0.7754912972450256", "-1.647324562072754", "-0.6113331913948059"], + ["0.07087723910808563", "1.5942802429199219", "1.4610432386398315"], + ], + type: "tabular", + }, + ], + }, + metrics: [ + { + description: "", + id: "", + }, + ], + models: [ + { + description: "A powerful feature extraction model for natural language processing tasks.", + id: "facebook/bart-base", + }, + { + description: "A strong feature extraction model for coding tasks.", + id: "microsoft/codebert-base", + }, + ], + spaces: [], + summary: + "Feature extraction refers to the process of transforming raw data into numerical features that can be processed while preserving the information in the original dataset.", + widgetModels: ["facebook/bart-base"], +}; + +export default taskData; diff --git a/packages/tasks/src/fill-mask/about.md b/packages/tasks/src/fill-mask/about.md new file mode 100644 index 000000000..4fabd3cf6 --- /dev/null +++ b/packages/tasks/src/fill-mask/about.md @@ -0,0 +1,51 @@ +## Use Cases + +### Domain Adaptation 👩‍⚕️ + +Masked language models do not require labelled data! They are trained by masking a couple of words in sentences and the model is expected to guess the masked word. This makes it very practical! + +For example, masked language modeling is used to train large models for domain-specific problems. If you have to work on a domain-specific task, such as retrieving information from medical research papers, you can train a masked language model using those papers. 
📄 + +The resulting model has a statistical understanding of the language used in medical research papers, and can be further trained in a process called fine-tuning to solve different tasks, such as [Text Classification](/tasks/text-classification) or [Question Answering](/tasks/question-answering) to build a medical research papers information extraction system. 👩‍⚕️ Pre-training on domain-specific data tends to yield better results (see [this paper](https://arxiv.org/abs/2007.15779) for an example). + +If you don't have the data to train a masked language model, you can also use an existing [domain-specific masked language model](https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext) from the Hub and fine-tune it with your smaller task dataset. That's the magic of Open Source and sharing your work! 🎉 + +## Inference with Fill-Mask Pipeline + +You can use the 🤗 Transformers library `fill-mask` pipeline to do inference with masked language models. If a model name is not provided, the pipeline will be initialized with [distilroberta-base](/distilroberta-base). You can provide masked text and it will return a list of possible mask values ranked according to the score. + +```python +from transformers import pipeline + +classifier = pipeline("fill-mask") +classifier("Paris is the <mask> of France.") + +# [{'score': 0.7, 'sequence': 'Paris is the capital of France.'}, +# {'score': 0.2, 'sequence': 'Paris is the birthplace of France.'}, +# {'score': 0.1, 'sequence': 'Paris is the heart of France.'}] +``` + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that can be helpful to you!
+ +- [Course Chapter on Fine-tuning a Masked Language Model](https://huggingface.co/course/chapter7/3?fw=pt) +- [Workshop on Pretraining Language Models and CodeParrot](https://www.youtube.com/watch?v=ExUR7w6xe94) +- [BERT 101: State Of The Art NLP Model Explained](https://huggingface.co/blog/bert-101) +- [Nyströmformer: Approximating self-attention in linear time and memory via the Nyström method](https://huggingface.co/blog/nystromformer) + +### Notebooks + +- [Pre-training an MLM for JAX/Flax](https://github.com/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb) +- [Masked language modeling in TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb) +- [Masked language modeling in PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) +- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling) + +### Documentation + +- [Masked language modeling task guide](https://huggingface.co/docs/transformers/tasks/masked_language_modeling) diff --git a/packages/tasks/src/fill-mask/data.ts b/packages/tasks/src/fill-mask/data.ts new file mode 100644 index 000000000..4e8204b15 --- /dev/null +++ b/packages/tasks/src/fill-mask/data.ts @@ -0,0 +1,79 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A common dataset that is used to train models for many languages.", + id: "wikipedia", + }, + { + description: "A large English dataset with text crawled from the web.", + id: "c4", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "The <mask> barked at me", + type: "text", + }, + ], + outputs: [ + { + type: "chart", +
data: [ + { + label: "wolf", + score: 0.487, + }, + { + label: "dog", + score: 0.061, + }, + { + label: "cat", + score: 0.058, + }, + { + label: "fox", + score: 0.047, + }, + { + label: "squirrel", + score: 0.025, + }, + ], + }, + ], + }, + metrics: [ + { + description: + "Cross Entropy is a metric that calculates the difference between two probability distributions. Each probability distribution is the distribution of predicted words", + id: "cross_entropy", + }, + { + description: + "Perplexity is the exponential of the cross-entropy loss. It evaluates the probabilities assigned to the next word by the model. Lower perplexity indicates better performance", + id: "perplexity", + }, + ], + models: [ + { + description: "A faster and smaller model than the famous BERT model.", + id: "distilbert-base-uncased", + }, + { + description: "A multilingual model trained on 100 languages.", + id: "xlm-roberta-base", + }, + ], + spaces: [], + summary: + "Masked language modeling is the task of masking some of the words in a sentence and predicting which words should replace those masks. These models are useful when we want to get a statistical understanding of the language in which the model is trained.", + widgetModels: ["distilroberta-base"], + youtubeId: "mqElG5QJWUg", +}; + +export default taskData; diff --git a/packages/tasks/src/image-classification/about.md b/packages/tasks/src/image-classification/about.md new file mode 100644 index 000000000..593f3b1dd --- /dev/null +++ b/packages/tasks/src/image-classification/about.md @@ -0,0 +1,50 @@ +## Use Cases + +Image classification models can be used when we are not interested in specific instances of objects with location information or their shape. + +### Keyword Classification + +Image classification models are used widely in stock photography to assign each image a keyword.
+ +### Image Search + +Models trained in image classification can improve user experience by organizing and categorizing photo galleries on the phone or in the cloud, on multiple keywords or tags. + +## Inference + +With the `transformers` library, you can use the `image-classification` pipeline to infer with image classification models. You can initialize the pipeline with a model id from the Hub. If you do not provide a model id it will initialize with [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) by default. When calling the pipeline you just need to specify a path, http link or an image loaded in PIL. You can also provide a `top_k` parameter which determines how many results it should return. + +```python +from transformers import pipeline +clf = pipeline("image-classification") +clf("path_to_a_cat_image") + +[{'label': 'tabby cat', 'score': 0.731}, +... +] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to classify images using models on Hugging Face Hub. 
+ +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.imageClassification({ + data: await (await fetch("https://picsum.photos/300/300")).blob(), + model: "microsoft/resnet-50", +}); +``` + +## Useful Resources + +- [Let's Play Pictionary with Machine Learning!](https://www.youtube.com/watch?v=LS9Y2wDVI0k) +- [Fine-Tune ViT for Image Classification with 🤗Transformers](https://huggingface.co/blog/fine-tune-vit) +- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8) +- [Computer Vision Study Group: Swin Transformer](https://www.youtube.com/watch?v=Ngikt-K1Ecc) +- [Computer Vision Study Group: Masked Autoencoders Paper Walkthrough](https://www.youtube.com/watch?v=Ngikt-K1Ecc) +- [Image classification task guide](https://huggingface.co/docs/transformers/tasks/image_classification) + +### Creating your own image classifier in just a few minutes + +With [HuggingPics](https://github.com/nateraw/huggingpics), you can fine-tune Vision Transformers for anything using images found on the web. This project downloads images of classes defined by you, trains a model, and pushes it to the Hub. You even get to try out the model directly with a working widget in the browser, ready to be shared with all your friends! 
diff --git a/packages/tasks/src/image-classification/data.ts b/packages/tasks/src/image-classification/data.ts new file mode 100644 index 000000000..4dcbff4f1 --- /dev/null +++ b/packages/tasks/src/image-classification/data.ts @@ -0,0 +1,88 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "Benchmark dataset used for image classification with images that belong to 100 classes.", + id: "cifar100", + }, + { + // TODO write proper description + description: "Dataset consisting of images of garments.", + id: "fashion_mnist", + }, + ], + demo: { + inputs: [ + { + filename: "image-classification-input.jpeg", + type: "img", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "Egyptian cat", + score: 0.514, + }, + { + label: "Tabby cat", + score: 0.193, + }, + { + label: "Tiger cat", + score: 0.068, + }, + ], + }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: "", + id: "f1", + }, + ], + models: [ + { + description: "A strong image classification model.", + id: "google/vit-base-patch16-224", + }, + { + description: "A robust image classification model.", + id: "facebook/deit-base-distilled-patch16-224", + }, + { + description: "A strong image classification model.", + id: "facebook/convnext-large-224", + }, + ], + spaces: [ + { + // TO DO: write description + description: "An application that classifies what a given image is about.", + id: "nielsr/perceiver-image-classification", + }, + ], + summary: + "Image classification is the task of assigning a label or class to an entire image. Images are expected to have only one class for each image. 
Image classification models take an image as input and return a prediction about which class the image belongs to.", + widgetModels: ["google/vit-base-patch16-224"], + youtubeId: "tjAIM7BOYhw", +}; + +export default taskData; diff --git a/packages/tasks/src/image-segmentation/about.md b/packages/tasks/src/image-segmentation/about.md new file mode 100644 index 000000000..3f26fb8ca --- /dev/null +++ b/packages/tasks/src/image-segmentation/about.md @@ -0,0 +1,63 @@ +## Use Cases + +### Autonomous Driving + +Segmentation models are used to identify road patterns such as lanes and obstacles for safer driving. + +### Background Removal + +Image Segmentation models are used in cameras to erase the background of certain objects and apply filters to them. + +### Medical Imaging + +Image Segmentation models are used to distinguish organs or tissues, improving medical imaging workflows. Models are used to segment dental instances, analyze X-Ray scans or even segment cells for pathological diagnosis. This [dataset](https://github.com/v7labs/covid-19-xray-dataset) contains images of lungs of healthy patients and patients with COVID-19 segmented with masks. Another [segmentation dataset](https://ivdm3seg.weebly.com/data.html) contains segmented MRI data of the lower spine to analyze the effect of spaceflight simulation. + +## Task Variants + +### Semantic Segmentation + +Semantic Segmentation is the task of segmenting parts of an image that belong to the same class. Semantic Segmentation models make predictions for each pixel and return the probabilities of the classes for each pixel. These models are evaluated on Mean Intersection Over Union (Mean IoU). + +### Instance Segmentation + +Instance Segmentation is the variant of Image Segmentation where every distinct object is segmented, instead of one segment per class. 
+ +### Panoptic Segmentation + +Panoptic Segmentation is the Image Segmentation task that segments the image both by instance and by class, assigning each pixel a different instance of the class. + +## Inference + +You can infer with Image Segmentation models using the `image-segmentation` pipeline. You need to install [timm](https://github.com/rwightman/pytorch-image-models) first. + +```python +!pip install timm +model = pipeline("image-segmentation") +model("cat.png") +#[{'label': 'cat', +# 'mask': mask_code, +# 'score': 0.999} +# ...] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image segmentation models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.imageSegmentation({ + data: await (await fetch("https://picsum.photos/300/300")).blob(), + model: "facebook/detr-resnet-50-panoptic", +}); +``` + +## Useful Resources + +Would you like to learn more about image segmentation? Great! Here you can find some curated resources that you may find helpful! 
+ +- [Fine-Tune a Semantic Segmentation Model with a Custom Dataset](https://huggingface.co/blog/fine-tune-segformer) +- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8) +- [A Guide on Universal Image Segmentation with Mask2Former and OneFormer](https://huggingface.co/blog/mask2former) +- [Zero-shot image segmentation with CLIPSeg](https://huggingface.co/blog/clipseg-zero-shot) +- [Semantic segmentation task guide](https://huggingface.co/docs/transformers/tasks/semantic_segmentation) diff --git a/packages/tasks/src/image-segmentation/data.ts b/packages/tasks/src/image-segmentation/data.ts new file mode 100644 index 000000000..c6bb835e7 --- /dev/null +++ b/packages/tasks/src/image-segmentation/data.ts @@ -0,0 +1,99 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Scene segmentation dataset.", + id: "scene_parse_150", + }, + ], + demo: { + inputs: [ + { + filename: "image-segmentation-input.jpeg", + type: "img", + }, + ], + outputs: [ + { + filename: "image-segmentation-output.png", + type: "img", + }, + ], + }, + metrics: [ + { + description: + "Average Precision (AP) is the Area Under the PR Curve (AUC-PR). It is calculated for each semantic class separately", + id: "Average Precision", + }, + { + description: "Mean Average Precision (mAP) is the overall average of the AP values", + id: "Mean Average Precision", + }, + { + description: + "Intersection over Union (IoU) is the overlap of segmentation masks. 
Mean IoU is the average of the IoU of all semantic classes", + id: "Mean Intersection over Union", + }, + { + description: "APα is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75", + id: "APα", + }, + ], + models: [ + { + // TO DO: write description + description: "Solid panoptic segmentation model trained on the COCO 2017 benchmark dataset.", + id: "facebook/detr-resnet-50-panoptic", + }, + { + description: "Semantic segmentation model trained on ADE20k benchmark dataset.", + id: "microsoft/beit-large-finetuned-ade-640-640", + }, + { + description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.", + id: "nvidia/segformer-b0-finetuned-ade-512-512", + }, + { + description: "Semantic segmentation model trained on the Cityscapes dataset.", + id: "facebook/mask2former-swin-large-cityscapes-semantic", + }, + { + description: "Panoptic segmentation model trained on the COCO (common objects) dataset.", + id: "facebook/mask2former-swin-large-coco-panoptic", + }, + ], + spaces: [ + { + description: "A semantic segmentation application that can predict unseen instances out of the box.", + id: "facebook/ov-seg", + }, + { + description: "One of the strongest segmentation applications.", + id: "jbrinkma/segment-anything", + }, + { + description: "A semantic segmentation application that predicts human silhouettes.", + id: "keras-io/Human-Part-Segmentation", + }, + { + description: "An instance segmentation application to predict neuronal cell types from microscopy images.", + id: "rashmi/sartorius-cell-instance-segmentation", + }, + { + description: "An application that segments videos.", + id: "ArtGAN/Segment-Anything-Video", + }, + { + description: "A panoptic segmentation application built for outdoor environments.", + id: "segments/panoptic-segment-anything", + }, + ], + summary: + "Image Segmentation divides an image into segments where each pixel in the image is mapped to an object.
This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation.", + widgetModels: ["facebook/detr-resnet-50-panoptic"], + youtubeId: "dKE8SIt9C-w", +}; + +export default taskData; diff --git a/packages/tasks/src/image-to-image/about.md b/packages/tasks/src/image-to-image/about.md new file mode 100644 index 000000000..4d6d1695b --- /dev/null +++ b/packages/tasks/src/image-to-image/about.md @@ -0,0 +1,80 @@ +## Use Cases + +### Style transfer + +One of the most popular use cases of image to image is style transfer. Style transfer models can convert a regular photograph into a painting in the style of a famous painter. + +## Task Variants + +### Image inpainting + +Image inpainting is widely used during photo editing to remove unwanted objects, such as poles, wires or sensor +dust. + +### Image colorization + +Old, black and white images can be brought to life using an image colorization model. + +### Super Resolution + +Super resolution models increase the resolution of an image, allowing for higher quality viewing and printing. + +## Inference + +You can use pipelines for image-to-image in the 🧨 Diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below. + +```python +import torch +from PIL import Image +from diffusers import StableDiffusionImg2ImgPipeline + +model_id_or_path = "runwayml/stable-diffusion-v1-5" +pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +init_image = Image.open("mountains_image.jpeg").convert("RGB").resize((768, 512)) +prompt = "A fantasy landscape, trending on artstation" + +images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images +images[0].save("fantasy_landscape.png") +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub. 
+ +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.imageToImage({ + data: await (await fetch("image")).blob(), + model: "timbrooks/instruct-pix2pix", + parameters: { + prompt: "Deblur this image", + }, +}); +``` + +## ControlNet + +Controlling outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a neural network type that provides an image based control to diffusion models. These controls can be edges or landmarks in an image. + +Many ControlNet models were trained in our community event, JAX Diffusers sprint. You can see the full list of the ControlNet models available [here](https://huggingface.co/spaces/jax-diffusers-event/leaderboard). + +## Most Used Model for the Task + +Pix2Pix is a popular model used for image to image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found. + +Below images show some of the examples shared in the paper that can be obtained using Pix2Pix. There are various cases this model can be applied on. It is capable of relatively simpler things, e.g. converting a grayscale image to its colored version. But more importantly, it can generate realistic pictures from rough sketches (can be seen in the purse example) or from painting-like images (can be seen in the street and facade examples below). + +Alt text + +## Useful Resources + +- [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet) +- [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet) + +## References + +[1] P. Isola, J. -Y. Zhu, T. Zhou and A. A. 
Efros, "Image-to-Image Translation with Conditional Adversarial Networks," 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017, pp. 5967-5976, doi: 10.1109/CVPR.2017.632. + +This page was made possible thanks to the efforts of [Paul Gafton](https://github.com/Paul92) and [Osman Alenbey](https://huggingface.co/osman93). diff --git a/packages/tasks/src/image-to-image/data.ts b/packages/tasks/src/image-to-image/data.ts new file mode 100644 index 000000000..9688dc249 --- /dev/null +++ b/packages/tasks/src/image-to-image/data.ts @@ -0,0 +1,101 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Synthetic dataset, for image relighting", + id: "VIDIT", + }, + { + description: "Multiple images of celebrities, used for facial expression translation", + id: "huggan/CelebA-faces", + }, + ], + demo: { + inputs: [ + { + filename: "image-to-image-input.jpeg", + type: "img", + }, + ], + outputs: [ + { + filename: "image-to-image-output.png", + type: "img", + }, + ], + }, + isPlaceholder: false, + metrics: [ + { + description: + "Peak Signal to Noise Ratio (PSNR) is an approximation of the human perception, considering the ratio of the absolute intensity with respect to the variations. Measured in dB, a high value indicates a high fidelity.", + id: "PSNR", + }, + { + description: + "Structural Similarity Index (SSIM) is a perceptual metric which compares the luminance, contrast and structure of two images. 
The values of SSIM range between -1 and 1, and higher values indicate closer resemblance to the original image.", + id: "SSIM", + }, + { + description: + "Inception Score (IS) is an analysis of the labels predicted by an image classification model when presented with a sample of the generated images.", + id: "IS", + }, + ], + models: [ + { + description: "A model that enhances images captured in low light conditions.", + id: "keras-io/low-light-image-enhancement", + }, + { + description: "A model that increases the resolution of an image.", + id: "keras-io/super-resolution", + }, + { + description: + "A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.", + id: "lambdalabs/sd-image-variations-diffusers", + }, + { + description: "A model that generates images based on segments in the input image and the text prompt.", + id: "mfidabel/controlnet-segment-anything", + }, + { + description: "A model that takes an image and an instruction to edit the image.", + id: "timbrooks/instruct-pix2pix", + }, + ], + spaces: [ + { + description: "Image enhancer application for low light.", + id: "keras-io/low-light-image-enhancement", + }, + { + description: "Style transfer application.", + id: "keras-io/neural-style-transfer", + }, + { + description: "An application that generates images based on segment control.", + id: "mfidabel/controlnet-segment-anything", + }, + { + description: "Image generation application that takes image control and text prompt.", + id: "hysts/ControlNet", + }, + { + description: "Colorize any image using this app.", + id: "ioclab/brightness-controlnet", + }, + { + description: "Edit images with instructions.", + id: "timbrooks/instruct-pix2pix", + }, + ], + summary: + "Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. 
Any image manipulation and enhancement is possible with image to image models.", + widgetModels: ["lllyasviel/sd-controlnet-canny"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/image-to-text/about.md b/packages/tasks/src/image-to-text/about.md new file mode 100644 index 000000000..a209ae22b --- /dev/null +++ b/packages/tasks/src/image-to-text/about.md @@ -0,0 +1,65 @@ +## Use Cases + +### Image Captioning + +Image Captioning is the process of generating a textual description of an image. +This can help visually impaired people understand what's happening in their surroundings. + +### Optical Character Recognition (OCR) + +OCR models convert the text present in an image, e.g. a scanned document, into machine-readable text. + +## Pix2Struct + +Pix2Struct is a state-of-the-art model built and released by Google AI. The model itself has to be trained on a downstream task to be used. These tasks include captioning UI components and images that contain text, and visual question answering on infographics, charts, scientific diagrams and more. You can find these models in the recommended models on this page. + +## Inference + +### Image Captioning + +You can use the 🤗 Transformers library's `image-to-text` pipeline to generate a caption for the input image. + +```python +from transformers import pipeline + +captioner = pipeline("image-to-text",model="Salesforce/blip-image-captioning-base") +captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parrots.png") +## [{'generated_text': 'two birds are standing next to each other '}] +``` + +### OCR + +This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single text-line images. 
+ +```python +from transformers import TrOCRProcessor, VisionEncoderDecoderModel + +processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten') +model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten') +pixel_values = processor(images="image.jpeg", return_tensors="pt").pixel_values + +generated_ids = model.generate(pixel_values) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-text models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.imageToText({ + data: await (await fetch("https://picsum.photos/300/300")).blob(), + model: "Salesforce/blip-image-captioning-base", +}); +``` + +## Useful Resources + +- [Image Captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) +- [Image captioning use case](https://blog.google/outreach-initiatives/accessibility/get-image-descriptions/) +- [Train Image Captioning model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb) +- [Train OCR model on your dataset ](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR) + +This page was made possible thanks to efforts of [Sukesh Perla](https://huggingface.co/hitchhiker3010) and [Johannes Kolbe](https://huggingface.co/johko). 
diff --git a/packages/tasks/src/image-to-text/data.ts b/packages/tasks/src/image-to-text/data.ts new file mode 100644 index 000000000..6a838ebea --- /dev/null +++ b/packages/tasks/src/image-to-text/data.ts @@ -0,0 +1,86 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "Dataset from 12M image-text of Reddit", + id: "red_caps", + }, + { + // TODO write proper description + description: "Dataset from 3.3M images of Google", + id: "datasets/conceptual_captions", + }, + ], + demo: { + inputs: [ + { + filename: "savanna.jpg", + type: "img", + }, + ], + outputs: [ + { + label: "Detailed description", + content: "a herd of giraffes and zebras grazing in a field", + type: "text", + }, + ], + }, + metrics: [], + models: [ + { + description: "A robust image captioning model.", + id: "Salesforce/blip-image-captioning-large", + }, + { + description: "A strong image captioning model.", + id: "nlpconnect/vit-gpt2-image-captioning", + }, + { + description: "A strong optical character recognition model.", + id: "microsoft/trocr-base-printed", + }, + { + description: "A strong visual question answering model for scientific diagrams.", + id: "google/pix2struct-ai2d-base", + }, + { + description: "A strong captioning model for UI components.", + id: "google/pix2struct-widget-captioning-base", + }, + { + description: "A captioning model for images that contain text.", + id: "google/pix2struct-textcaps-base", + }, + ], + spaces: [ + { + description: "A robust image captioning application.", + id: "flax-community/image-captioning", + }, + { + description: "An application that transcribes handwritings into text.", + id: "nielsr/TrOCR-handwritten", + }, + { + description: "An application that can caption images and answer questions about a given image.", + id: "Salesforce/BLIP", + }, + { + description: "An application that can caption images and answer questions with a 
conversational agent.", + id: "Salesforce/BLIP2", + }, + { + description: "An image captioning application that demonstrates the effect of noise on captions.", + id: "johko/capdec-image-captioning", + }, + ], + summary: + "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.", + widgetModels: ["Salesforce/blip-image-captioning-base"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/index.ts b/packages/tasks/src/index.ts new file mode 100644 index 000000000..0519b7d96 --- /dev/null +++ b/packages/tasks/src/index.ts @@ -0,0 +1,4 @@ +export type { TaskData, TaskDemo, TaskDemoEntry, ExampleRepo } from "./Types"; +export { TASKS_DATA } from "./tasksData"; +export { PIPELINE_DATA, type PipelineType, type PipelineData, type Modality, MODALITIES } from "./pipelines"; +export { ModelLibrary } from "./modelLibraries"; diff --git a/packages/tasks/src/modelLibraries.ts b/packages/tasks/src/modelLibraries.ts new file mode 100644 index 000000000..6d76980f5 --- /dev/null +++ b/packages/tasks/src/modelLibraries.ts @@ -0,0 +1,43 @@ +/** + * Add your new library here. + * + * This is for modeling (= architectures) libraries, not for file formats (like ONNX, etc). + * File formats live in an enum inside the internal codebase. 
+ */ +export enum ModelLibrary { + "adapter-transformers" = "Adapter Transformers", + "allennlp" = "allenNLP", + "asteroid" = "Asteroid", + "bertopic" = "BERTopic", + "diffusers" = "Diffusers", + "doctr" = "docTR", + "espnet" = "ESPnet", + "fairseq" = "Fairseq", + "flair" = "Flair", + "keras" = "Keras", + "k2" = "K2", + "nemo" = "NeMo", + "open_clip" = "OpenCLIP", + "paddlenlp" = "PaddleNLP", + "peft" = "PEFT", + "pyannote-audio" = "pyannote.audio", + "sample-factory" = "Sample Factory", + "sentence-transformers" = "Sentence Transformers", + "sklearn" = "Scikit-learn", + "spacy" = "spaCy", + "span-marker" = "SpanMarker", + "speechbrain" = "speechbrain", + "tensorflowtts" = "TensorFlowTTS", + "timm" = "Timm", + "fastai" = "fastai", + "transformers" = "Transformers", + "transformers.js" = "Transformers.js", + "stanza" = "Stanza", + "fasttext" = "fastText", + "stable-baselines3" = "Stable-Baselines3", + "ml-agents" = "ML-Agents", + "pythae" = "Pythae", + "mindspore" = "MindSpore", +} + +export type ModelLibraryKey = keyof typeof ModelLibrary; diff --git a/packages/tasks/src/object-detection/about.md b/packages/tasks/src/object-detection/about.md new file mode 100644 index 000000000..4dda21224 --- /dev/null +++ b/packages/tasks/src/object-detection/about.md @@ -0,0 +1,37 @@ +## Use Cases + +### Autonomous Driving + +Object Detection is widely used in computer vision for autonomous driving. Self-driving cars use Object Detection models to detect pedestrians, bicycles, traffic lights and road signs to decide which step to take. + +### Object Tracking in Matches + +Object Detection models are widely used in sports where the ball or a player is tracked for monitoring and refereeing during matches. + +### Image Search + +Object Detection models are widely used in image search. Smartphones use Object Detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the Internet. 
+ +### Object Counting + +Object Detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores, or counting the number of visitors in a store. They are also used to manage crowds at events to prevent disasters. + +## Inference + +You can infer with Object Detection models through the `object-detection` pipeline. When calling the pipeline you just need to specify a path or http link to an image. + +```python +model = pipeline("object-detection") + +model("path_to_cat_image") + +# [{'label': 'blanket', +# 'box': {'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...}, +# 'score': 0.917}, +#...] +``` + +## Useful Resources + +- [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8) +- [Object detection task guide](https://huggingface.co/docs/transformers/tasks/object_detection) diff --git a/packages/tasks/src/object-detection/data.ts b/packages/tasks/src/object-detection/data.ts new file mode 100644 index 000000000..8ffe8ea1b --- /dev/null +++ b/packages/tasks/src/object-detection/data.ts @@ -0,0 +1,76 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "Widely used benchmark dataset for multiple Vision tasks.", + id: "merve/coco2017", + }, + ], + demo: { + inputs: [ + { + filename: "object-detection-input.jpg", + type: "img", + }, + ], + outputs: [ + { + filename: "object-detection-output.jpg", + type: "img", + }, + ], + }, + metrics: [ + { + description: + "The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). 
It is calculated for each class separately", + id: "Average Precision", + }, + { + description: "The Mean Average Precision (mAP) metric is the overall average of the AP values", + id: "Mean Average Precision", + }, + { + description: + "The APα metric is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75", + id: "APα", + }, + ], + models: [ + { + // TO DO: write description + description: "Solid object detection model trained on the benchmark dataset COCO 2017.", + id: "facebook/detr-resnet-50", + }, + { + description: "Strong object detection model trained on ImageNet-21k dataset.", + id: "microsoft/beit-base-patch16-224-pt22k-ft22k", + }, + ], + spaces: [ + { + description: "An object detection application that can detect unseen objects out of the box.", + id: "adirik/OWL-ViT", + }, + { + description: "An application that contains various object detection models to try from.", + id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS", + }, + { + description: "An application that shows multiple cutting edge techniques for object detection and tracking.", + id: "kadirnar/torchyolo", + }, + { + description: "An object tracking, segmentation and inpainting application.", + id: "VIPLab/Track-Anything", + }, + ], + summary: + "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.", + widgetModels: ["facebook/detr-resnet-50"], + youtubeId: "WdAeKSOpxhw", +}; + +export default taskData; diff --git a/packages/tasks/src/pipelines.ts b/packages/tasks/src/pipelines.ts new file mode 100644 index 000000000..ab22782d5 --- /dev/null +++ b/packages/tasks/src/pipelines.ts @@ -0,0 +1,608 @@ +export const MODALITIES = ["cv", "nlp", "audio", "tabular", "multimodal", "rl", "other"] as const; + +export type Modality = (typeof MODALITIES)[number]; + +/** + * Public interface for a sub task. 
+ * + * This can be used in a model card's `model-index` metadata. + * and is more granular classification that can grow significantly + * over time as new tasks are added. + */ +export interface SubTask { + /** + * type of the task (e.g. audio-source-separation) + */ + type: string; + /** + * displayed name of the task (e.g. Audio Source Separation) + */ + name: string; +} + +/** + * Public interface for a PipelineData. + * + * This information corresponds to a pipeline type (aka task) + * in the Hub. + */ +export interface PipelineData { + /** + * displayed name of the task (e.g. Text Classification) + */ + name: string; + subtasks?: SubTask[]; + modality: Modality; + /** + * color for the tag icon. + */ + color: "blue" | "green" | "indigo" | "orange" | "red" | "yellow"; + /** + * whether to hide in /models filters + */ + hideInModels?: boolean; + /** + * whether to hide in /datasets filters + */ + hideInDatasets?: boolean; +} + +/// Coarse-grained taxonomy of tasks +/// +/// This type is used in multiple places in the Hugging Face +/// ecosystem: +/// - To determine which widget to show. +/// - To determine which endpoint of Inference API to use. +/// - As filters at the left of models and datasets page. +/// +/// Note that this is sensitive to order. +/// For each domain, the order should be of decreasing specificity. +/// This will impact the default pipeline tag of a model when not +/// specified. 
+export const PIPELINE_DATA = { + "text-classification": { + name: "Text Classification", + subtasks: [ + { + type: "acceptability-classification", + name: "Acceptability Classification", + }, + { + type: "entity-linking-classification", + name: "Entity Linking Classification", + }, + { + type: "fact-checking", + name: "Fact Checking", + }, + { + type: "intent-classification", + name: "Intent Classification", + }, + { + type: "language-identification", + name: "Language Identification", + }, + { + type: "multi-class-classification", + name: "Multi Class Classification", + }, + { + type: "multi-label-classification", + name: "Multi Label Classification", + }, + { + type: "multi-input-text-classification", + name: "Multi-input Text Classification", + }, + { + type: "natural-language-inference", + name: "Natural Language Inference", + }, + { + type: "semantic-similarity-classification", + name: "Semantic Similarity Classification", + }, + { + type: "sentiment-classification", + name: "Sentiment Classification", + }, + { + type: "topic-classification", + name: "Topic Classification", + }, + { + type: "semantic-similarity-scoring", + name: "Semantic Similarity Scoring", + }, + { + type: "sentiment-scoring", + name: "Sentiment Scoring", + }, + { + type: "sentiment-analysis", + name: "Sentiment Analysis", + }, + { + type: "hate-speech-detection", + name: "Hate Speech Detection", + }, + { + type: "text-scoring", + name: "Text Scoring", + }, + ], + modality: "nlp", + color: "orange", + }, + "token-classification": { + name: "Token Classification", + subtasks: [ + { + type: "named-entity-recognition", + name: "Named Entity Recognition", + }, + { + type: "part-of-speech", + name: "Part of Speech", + }, + { + type: "parsing", + name: "Parsing", + }, + { + type: "lemmatization", + name: "Lemmatization", + }, + { + type: "word-sense-disambiguation", + name: "Word Sense Disambiguation", + }, + { + type: "coreference-resolution", + name: "Coreference-resolution", + }, + ], + 
modality: "nlp", + color: "blue", + }, + "table-question-answering": { + name: "Table Question Answering", + modality: "nlp", + color: "green", + }, + "question-answering": { + name: "Question Answering", + subtasks: [ + { + type: "extractive-qa", + name: "Extractive QA", + }, + { + type: "open-domain-qa", + name: "Open Domain QA", + }, + { + type: "closed-domain-qa", + name: "Closed Domain QA", + }, + ], + modality: "nlp", + color: "blue", + }, + "zero-shot-classification": { + name: "Zero-Shot Classification", + modality: "nlp", + color: "yellow", + }, + translation: { + name: "Translation", + modality: "nlp", + color: "green", + }, + summarization: { + name: "Summarization", + subtasks: [ + { + type: "news-articles-summarization", + name: "News Articles Summarization", + }, + { + type: "news-articles-headline-generation", + name: "News Articles Headline Generation", + }, + ], + modality: "nlp", + color: "indigo", + }, + conversational: { + name: "Conversational", + subtasks: [ + { + type: "dialogue-generation", + name: "Dialogue Generation", + }, + ], + modality: "nlp", + color: "green", + }, + "feature-extraction": { + name: "Feature Extraction", + modality: "multimodal", + color: "red", + }, + "text-generation": { + name: "Text Generation", + subtasks: [ + { + type: "dialogue-modeling", + name: "Dialogue Modeling", + }, + { + type: "language-modeling", + name: "Language Modeling", + }, + ], + modality: "nlp", + color: "indigo", + }, + "text2text-generation": { + name: "Text2Text Generation", + subtasks: [ + { + type: "text-simplification", + name: "Text simplification", + }, + { + type: "explanation-generation", + name: "Explanation Generation", + }, + { + type: "abstractive-qa", + name: "Abstractive QA", + }, + { + type: "open-domain-abstractive-qa", + name: "Open Domain Abstractive QA", + }, + { + type: "closed-domain-qa", + name: "Closed Domain QA", + }, + { + type: "open-book-qa", + name: "Open Book QA", + }, + { + type: "closed-book-qa", + name: "Closed 
Book QA", + }, + ], + modality: "nlp", + color: "indigo", + }, + "fill-mask": { + name: "Fill-Mask", + subtasks: [ + { + type: "slot-filling", + name: "Slot Filling", + }, + { + type: "masked-language-modeling", + name: "Masked Language Modeling", + }, + ], + modality: "nlp", + color: "red", + }, + "sentence-similarity": { + name: "Sentence Similarity", + modality: "nlp", + color: "yellow", + }, + "text-to-speech": { + name: "Text-to-Speech", + modality: "audio", + color: "yellow", + }, + "text-to-audio": { + name: "Text-to-Audio", + modality: "audio", + color: "yellow", + }, + "automatic-speech-recognition": { + name: "Automatic Speech Recognition", + modality: "audio", + color: "yellow", + }, + "audio-to-audio": { + name: "Audio-to-Audio", + modality: "audio", + color: "blue", + }, + "audio-classification": { + name: "Audio Classification", + subtasks: [ + { + type: "keyword-spotting", + name: "Keyword Spotting", + }, + { + type: "speaker-identification", + name: "Speaker Identification", + }, + { + type: "audio-intent-classification", + name: "Audio Intent Classification", + }, + { + type: "audio-emotion-recognition", + name: "Audio Emotion Recognition", + }, + { + type: "audio-language-identification", + name: "Audio Language Identification", + }, + ], + modality: "audio", + color: "green", + }, + "voice-activity-detection": { + name: "Voice Activity Detection", + modality: "audio", + color: "red", + }, + "depth-estimation": { + name: "Depth Estimation", + modality: "cv", + color: "yellow", + }, + "image-classification": { + name: "Image Classification", + subtasks: [ + { + type: "multi-label-image-classification", + name: "Multi Label Image Classification", + }, + { + type: "multi-class-image-classification", + name: "Multi Class Image Classification", + }, + ], + modality: "cv", + color: "blue", + }, + "object-detection": { + name: "Object Detection", + subtasks: [ + { + type: "face-detection", + name: "Face Detection", + }, + { + type: "vehicle-detection", + 
name: "Vehicle Detection", + }, + ], + modality: "cv", + color: "yellow", + }, + "image-segmentation": { + name: "Image Segmentation", + subtasks: [ + { + type: "instance-segmentation", + name: "Instance Segmentation", + }, + { + type: "semantic-segmentation", + name: "Semantic Segmentation", + }, + { + type: "panoptic-segmentation", + name: "Panoptic Segmentation", + }, + ], + modality: "cv", + color: "green", + }, + "text-to-image": { + name: "Text-to-Image", + modality: "multimodal", + color: "yellow", + }, + "image-to-text": { + name: "Image-to-Text", + subtasks: [ + { + type: "image-captioning", + name: "Image Captioning", + }, + ], + modality: "multimodal", + color: "red", + }, + "image-to-image": { + name: "Image-to-Image", + modality: "cv", + color: "indigo", + }, + "unconditional-image-generation": { + name: "Unconditional Image Generation", + modality: "cv", + color: "green", + }, + "video-classification": { + name: "Video Classification", + modality: "cv", + color: "blue", + }, + "reinforcement-learning": { + name: "Reinforcement Learning", + modality: "rl", + color: "red", + }, + robotics: { + name: "Robotics", + modality: "rl", + subtasks: [ + { + type: "grasping", + name: "Grasping", + }, + { + type: "task-planning", + name: "Task Planning", + }, + ], + color: "blue", + }, + "tabular-classification": { + name: "Tabular Classification", + modality: "tabular", + subtasks: [ + { + type: "tabular-multi-class-classification", + name: "Tabular Multi Class Classification", + }, + { + type: "tabular-multi-label-classification", + name: "Tabular Multi Label Classification", + }, + ], + color: "blue", + }, + "tabular-regression": { + name: "Tabular Regression", + modality: "tabular", + subtasks: [ + { + type: "tabular-single-column-regression", + name: "Tabular Single Column Regression", + }, + ], + color: "blue", + }, + "tabular-to-text": { + name: "Tabular to Text", + modality: "tabular", + subtasks: [ + { + type: "rdf-to-text", + name: "RDF to text", + }, + 
], + color: "blue", + hideInModels: true, + }, + "table-to-text": { + name: "Table to Text", + modality: "nlp", + color: "blue", + hideInModels: true, + }, + "multiple-choice": { + name: "Multiple Choice", + subtasks: [ + { + type: "multiple-choice-qa", + name: "Multiple Choice QA", + }, + { + type: "multiple-choice-coreference-resolution", + name: "Multiple Choice Coreference Resolution", + }, + ], + modality: "nlp", + color: "blue", + hideInModels: true, + }, + "text-retrieval": { + name: "Text Retrieval", + subtasks: [ + { + type: "document-retrieval", + name: "Document Retrieval", + }, + { + type: "utterance-retrieval", + name: "Utterance Retrieval", + }, + { + type: "entity-linking-retrieval", + name: "Entity Linking Retrieval", + }, + { + type: "fact-checking-retrieval", + name: "Fact Checking Retrieval", + }, + ], + modality: "nlp", + color: "indigo", + hideInModels: true, + }, + "time-series-forecasting": { + name: "Time Series Forecasting", + modality: "tabular", + subtasks: [ + { + type: "univariate-time-series-forecasting", + name: "Univariate Time Series Forecasting", + }, + { + type: "multivariate-time-series-forecasting", + name: "Multivariate Time Series Forecasting", + }, + ], + color: "blue", + hideInModels: true, + }, + "text-to-video": { + name: "Text-to-Video", + modality: "multimodal", + color: "green", + }, + "visual-question-answering": { + name: "Visual Question Answering", + subtasks: [ + { + type: "visual-question-answering", + name: "Visual Question Answering", + }, + ], + modality: "multimodal", + color: "red", + }, + "document-question-answering": { + name: "Document Question Answering", + subtasks: [ + { + type: "document-question-answering", + name: "Document Question Answering", + }, + ], + modality: "multimodal", + color: "blue", + hideInDatasets: true, + }, + "zero-shot-image-classification": { + name: "Zero-Shot Image Classification", + modality: "cv", + color: "yellow", + }, + "graph-ml": { + name: "Graph Machine Learning", + 
modality: "multimodal", + color: "green", + }, + other: { + name: "Other", + modality: "other", + color: "blue", + hideInModels: true, + hideInDatasets: true, + }, +} satisfies Record<string, PipelineData>; + +export type PipelineType = keyof typeof PIPELINE_DATA; diff --git a/packages/tasks/src/placeholder/about.md b/packages/tasks/src/placeholder/about.md new file mode 100644 index 000000000..fdb455844 --- /dev/null +++ b/packages/tasks/src/placeholder/about.md @@ -0,0 +1,15 @@ +## Use Cases + +You can contribute to this area with common use cases of the task! + +## Task Variants + +This place can be filled with variants of this task if there are any. + +## Inference + +This section should have useful information about how to pull a model from Hugging Face Hub that is a part of a library specialized in a task and use it. + +## Useful Resources + +In this area, you can insert useful resources about how to train or use a model for this task. diff --git a/packages/tasks/src/placeholder/data.ts b/packages/tasks/src/placeholder/data.ts new file mode 100644 index 000000000..3660b5219 --- /dev/null +++ b/packages/tasks/src/placeholder/data.ts @@ -0,0 +1,18 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [], + demo: { + inputs: [], + outputs: [], + }, + isPlaceholder: true, + metrics: [], + models: [], + spaces: [], + summary: "", + widgetModels: [], + youtubeId: undefined, +}; + +export default taskData; diff --git a/packages/tasks/src/question-answering/about.md b/packages/tasks/src/question-answering/about.md new file mode 100644 index 000000000..d5934ee80 --- /dev/null +++ b/packages/tasks/src/question-answering/about.md @@ -0,0 +1,56 @@ +## Use Cases + +### Frequently Asked Questions + +You can use Question Answering (QA) models to automate the response to frequently asked questions by using a knowledge base (documents) as context. Answers to customer questions can be drawn from those documents. 
+ +⚡⚡ If you’d like to save inference time, you can first use [passage ranking models](/tasks/sentence-similarity) to see which document might contain the answer to the question and iterate over that document with the QA model instead. + +## Task Variants +There are different QA variants based on the inputs and outputs: + +- **Extractive QA:** The model **extracts** the answer from a context. The context here could be a provided text, a table or even HTML! This is usually solved with BERT-like models. +- **Open Generative QA:** The model **generates** free text directly based on the context. You can learn more about the Text Generation task in [its page](/tasks/text-generation). +- **Closed Generative QA:** In this case, no context is provided. The answer is completely generated by a model. + +The schema above illustrates extractive, open book QA. The model takes a context and the question and extracts the answer from the given context. + +You can also differentiate QA models depending on whether they are open-domain or closed-domain. Open-domain models are not restricted to a specific domain, while closed-domain models are restricted to a specific domain (e.g. legal, medical documents). + +## Inference + +You can infer with QA models with the 🤗 Transformers library using the `question-answering` pipeline. If no model checkpoint is given, the pipeline will be initialized with `distilbert-base-cased-distilled-squad`. This pipeline takes a question and a context from which the answer will be extracted and returned. + +```python +from transformers import pipeline + +qa_model = pipeline("question-answering") +question = "Where do I live?" +context = "My name is Merve and I live in İstanbul." +qa_model(question = question, context = context) +## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31} +``` + +## Useful Resources + +Would you like to learn more about QA? Awesome! Here are some curated resources that you may find helpful! 
+ +- [Course Chapter on Question Answering](https://huggingface.co/course/chapter7/7?fw=pt) +- [Question Answering Workshop](https://www.youtube.com/watch?v=Ihgk8kGLpIE&ab_channel=HuggingFace) +- [How to Build an Open-Domain Question Answering System?](https://lilianweng.github.io/lil-log/2020/10/29/open-domain-question-answering.html) +- [Blog Post: ELI5 A Model for Open Domain Long Form Question Answering](https://yjernite.github.io/lfqa.html) + +### Notebooks + +- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb) +- [TensorFlow](https://github.com/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) +- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering) + +### Documentation + +- [Question answering task guide](https://huggingface.co/docs/transformers/tasks/question_answering) diff --git a/packages/tasks/src/question-answering/data.ts b/packages/tasks/src/question-answering/data.ts new file mode 100644 index 000000000..dee5ccf64 --- /dev/null +++ b/packages/tasks/src/question-answering/data.ts @@ -0,0 +1,71 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "A famous question answering dataset based on English articles from Wikipedia.", + id: "squad_v2", + }, + { + // TODO write proper description + description: "A dataset of aggregated anonymized actual queries issued to the Google search engine.", + id: "natural_questions", + }, + ], + demo: { + inputs: [ + { + label: "Question", + content: "Which name is also used to describe the Amazon rainforest in English?", + type: "text", + }, + { + label: "Context", + 
content: "The Amazon rainforest, also known in English as Amazonia or the Amazon Jungle", + type: "text", + }, + ], + outputs: [ + { + label: "Answer", + content: "Amazonia", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "Exact Match is a metric based on the strict character match of the predicted answer and the right answer. For answers predicted correctly, the Exact Match will be 1. Even if only one character is different, Exact Match will be 0", + id: "exact-match", + }, + { + description: + " The F1-Score metric is useful if we value both false positives and false negatives equally. The F1-Score is calculated on each word in the predicted sequence against the correct answer", + id: "f1", + }, + ], + models: [ + { + description: "A robust baseline model for most question answering domains.", + id: "deepset/roberta-base-squad2", + }, + { + description: "A special model that can answer questions from tables!", + id: "google/tapas-base-finetuned-wtq", + }, + ], + spaces: [ + { + description: "An application that can answer a long question from Wikipedia.", + id: "deepset/wikipedia-assistant", + }, + ], + summary: + "Question Answering models can retrieve the answer to a question from a given text, which is useful for searching for an answer in a document. Some question answering models can generate answers without context!", + widgetModels: ["deepset/roberta-base-squad2"], + youtubeId: "ajPx5LwJD-I", +}; + +export default taskData; diff --git a/packages/tasks/src/reinforcement-learning/about.md b/packages/tasks/src/reinforcement-learning/about.md new file mode 100644 index 000000000..13f79cfff --- /dev/null +++ b/packages/tasks/src/reinforcement-learning/about.md @@ -0,0 +1,167 @@ +## Use Cases + +### Gaming + +Reinforcement learning is known for its application to video games. 
Since the games provide a safe environment for the agent to be trained in the sense that it is perfectly defined and controllable, this makes them perfect candidates for experimentation and will help a lot to learn about the capabilities and limitations of various RL algorithms. + +There are many videos on the Internet where a game-playing reinforcement learning agent starts with a terrible gaming strategy due to random initialization of its settings, but over iterations, the agent gets better and better with each episode of the training. This [paper](https://arxiv.org/abs/1912.10944) mainly investigates the performance of RL in popular games such as Minecraft or Dota2. The agent's performance can exceed a human player's, although there are still some challenges mainly related to efficiency in constructing the gaming policy of the reinforcement learning agent. + +### Trading and Finance + +Reinforcement learning is the science of training computers to make decisions and thus has a novel use in trading and finance. All time-series models are helpful in predicting prices, volume and future sales of a product or a stock. Reinforcement based automated agents can decide to sell, buy or hold a stock. It shifts the impact of AI in this field to real time decision making rather than just prediction of prices. The glossary given below will clarify some of the concepts behind how we can train a model to make these decisions. + +## Task Variants + +### Model Based RL + +Model based reinforcement learning techniques intend to create a model of the environment, learn the state transition probabilities and the reward function, to find the optimal action. Some typical examples of model based reinforcement learning algorithms are dynamic programming, value iteration and policy iteration. + +### Model Free RL + +In model free reinforcement learning, the agent decides on optimal actions based on its experience in the environment and the reward it collects from it.
This is one of the most commonly used approaches, beneficial in complex environments where modeling of state transition probabilities and reward functions is difficult. Some of the examples of model free reinforcement learning are SARSA, Q-Learning, actor-critic and proximal policy optimization (PPO) algorithms. + +## Glossary + + + +**Agent:** The learner and the decision maker. + +**Environment:** The part of the world the agent interacts with, comprising everything outside the agent. + +Observations and states are the information our agent gets from the environment. In the case of a video game, it can be a frame (a screenshot). In the case of the trading agent, it can be the value of a certain stock. + +**State:** Complete description of the state of the environment with no hidden information. + +**Observation:** Partial description of the state, in a partially observed environment. + +**Action:** The decision taken by the agent. + +**Reward:** The numerical feedback signal that the agent receives from the environment based on the chosen action. + +**Return:** Cumulative Reward. In the simplest case, the return is the sum of the rewards. + +**Episode:** For some applications there is a natural notion of final time step. In this case, there is a starting point and an ending point (a terminal state). This creates an episode: a list of States, Actions, Rewards, and new States. For instance, think about Chess: an episode begins at the initial board position and ends when the game is over. + +**Policy:** The Policy is the brain of the Agent, it’s the function that tells what action to take given the state. So it defines the agent’s behavior at a given time. Reinforcement learning methods specify how the agent’s policy is changed as a result of its experience. + +## Inference + +Inference in reinforcement learning differs from other modalities, in which there's a model and test data.
In reinforcement learning, once you have trained an agent in an environment, you try to run the trained agent for additional steps to get the average reward. + +A typical training cycle consists of gathering experience from the environment, training the agent, and running the agent on a test environment to obtain average reward. The snippets below show how you can interact with the environment using the `gymnasium` library, train an agent using `stable-baselines3`, evaluate the agent on a test environment and infer actions from the trained agent. + +```python +# Here we are running 20 steps of the CartPole-v1 environment, taking random actions +import gymnasium as gym + +env = gym.make("CartPole-v1") +observation, info = env.reset() + +for _ in range(20): + action = env.action_space.sample() # samples random action from action sample space + + # the agent takes the action + observation, reward, terminated, truncated, info = env.step(action) + + +# if the agent reaches terminal state, we reset the environment +if terminated or truncated: + + print("Environment is reset") + observation, info = env.reset() + +env.close() +``` + +The snippet below shows how to train a PPO model on the LunarLander-v2 environment using the `stable-baselines3` library and save the model + +```python +from stable_baselines3 import PPO + +# initialize the environment + +env = gym.make("LunarLander-v2") + +# initialize the model + +model = PPO(policy = "MlpPolicy", + env = env, + n_steps = 1024, + batch_size = 64, + n_epochs = 4, + verbose = 1) + +# train the model for 1000 time steps +model.learn(total_timesteps = 1000) + +# Saving the model in desired directory +model_name = "PPO-LunarLander-v2" +model.save(model_name) +``` + +The code below shows how to evaluate an agent trained using `stable-baselines3` + +```python +# Loading a saved model and evaluating the model for 10 episodes +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3 import PPO + + +env =
gym.make("LunarLander-v2") +# Loading the saved model +model = PPO.load("PPO-LunarLander-v2",env=env) + +# Initializing the evaluation environment +eval_env = gym.make("LunarLander-v2") + +# Running the trained agent on eval_env for 10 episodes and getting the mean reward +mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes = 10, + deterministic=True) + +print(f"mean_reward={mean_reward:.2f} +/- {std_reward}") +``` + +The code snippet below shows how to infer actions from an agent trained using `stable-baselines3` + +```python +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3 import PPO + +# Loading the saved model +model = PPO.load("PPO-LunarLander-v2",env=env) + +# Getting the environment from the trained agent +env = model.get_env() + +obs = env.reset() +for i in range(1000): + # getting action predictions from the trained agent + action, _states = model.predict(obs, deterministic=True) + + # taking the predicted action in the environment to observe next state and rewards + obs, rewards, dones, info = env.step(action) +``` + +For more information, you can check out the documentation of the respective libraries. + +[Gymnasium Documentation](https://gymnasium.farama.org/) +[Stable Baselines Documentation](https://stable-baselines3.readthedocs.io/en/master/) + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful!
+ +- [HuggingFace Deep Reinforcement Learning Class](https://github.com/huggingface/deep-rl-class) +- [Introduction to Deep Reinforcement Learning](https://huggingface.co/blog/deep-rl-intro) +- [Stable Baselines Integration with HuggingFace](https://huggingface.co/blog/sb3) +- Learn how reinforcement learning is used in conversational agents in this blog: [Illustrating Reinforcement Learning from Human Feedback (RLHF)](https://huggingface.co/blog/rlhf) +- [Reinforcement Learning from Human Feedback From Zero to ChatGPT](https://www.youtube.com/watch?v=EAd4oQtEJOM) +- [Guide on Multi-Agent Competition Systems](https://huggingface.co/blog/aivsai) + +### Notebooks + +- [Train a Deep Reinforcement Learning lander agent to land correctly on the Moon 🌕 using Stable-Baselines3](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit1/unit1.ipynb) +- [Introduction to Unity MLAgents](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit5/unit5.ipynb) +- [Training Decision Transformers with 🤗 transformers](https://github.com/huggingface/blog/blob/main/notebooks/101_train-decision-transformers.ipynb) + +This page was made possible thanks to the efforts of [Ram Ananth](https://huggingface.co/RamAnanth1), [Emilio Lehoucq](https://huggingface.co/emiliol), [Sagar Mathpal](https://huggingface.co/sagarmathpal) and [Osman Alenbey](https://huggingface.co/osman93). 
diff --git a/packages/tasks/src/reinforcement-learning/data.ts b/packages/tasks/src/reinforcement-learning/data.ts new file mode 100644 index 000000000..78731ec20 --- /dev/null +++ b/packages/tasks/src/reinforcement-learning/data.ts @@ -0,0 +1,75 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A curation of widely used datasets for Data Driven Deep Reinforcement Learning (D4RL)", + id: "edbeeching/decision_transformer_gym_replay", + }, + ], + demo: { + inputs: [ + { + label: "State", + content: "Red traffic light, pedestrians are about to pass.", + type: "text", + }, + ], + outputs: [ + { + label: "Action", + content: "Stop the car.", + type: "text", + }, + { + label: "Next State", + content: "Yellow light, pedestrians have crossed.", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "Accumulated reward across all time steps discounted by a factor that ranges between 0 and 1 and determines how much the agent optimizes for future relative to immediate rewards. Measures how good is the policy ultimately found by a given algorithm considering uncertainty over the future.", + id: "Discounted Total Reward", + }, + { + description: + "Average return obtained after running the policy for a certain number of evaluation episodes. As opposed to total reward, mean reward considers how much reward a given algorithm receives while learning.", + id: "Mean Reward", + }, + { + description: + "Measures how good a given algorithm is after a predefined time. Some algorithms may be guaranteed to converge to optimal behavior across many time steps. 
However, an agent that reaches an acceptable level of optimality after a given time horizon may be preferable to one that ultimately reaches optimality but takes a long time.", + id: "Level of Performance After Some Time", + }, + ], + models: [ + { + description: "A Reinforcement Learning model trained on expert data from the Gym Hopper environment", + + id: "edbeeching/decision-transformer-gym-hopper-expert", + }, + { + description: "A PPO agent playing seals/CartPole-v0 using the stable-baselines3 library and the RL Zoo.", + id: "HumanCompatibleAI/ppo-seals-CartPole-v0", + }, + ], + spaces: [ + { + description: "An application for a cute puppy agent learning to catch a stick.", + id: "ThomasSimonini/Huggy", + }, + { + description: "An application to play Snowball Fight with a reinforcement learning agent.", + id: "ThomasSimonini/SnowballFight", + }, + ], + summary: + "Reinforcement learning is the computational approach of learning from action by interacting with an environment through trial and error and receiving rewards (negative or positive) as feedback", + widgetModels: [], + youtubeId: "q0BiUn5LiBc", +}; + +export default taskData; diff --git a/packages/tasks/src/sentence-similarity/about.md b/packages/tasks/src/sentence-similarity/about.md new file mode 100644 index 000000000..ee536235d --- /dev/null +++ b/packages/tasks/src/sentence-similarity/about.md @@ -0,0 +1,97 @@ +## Use Cases 🔍 + +### Information Retrieval + +You can extract information from documents using Sentence Similarity models. The first step is to rank documents using Passage Ranking models. You can then get to the top ranked document and search it with Sentence Similarity models by selecting the sentence that has the most similarity to the input query. + +## The Sentence Transformers library + +The [Sentence Transformers](https://www.sbert.net/) library is very powerful for calculating embeddings of sentences, paragraphs, and entire documents. 
An embedding is just a vector representation of a text and is useful for finding how similar two texts are. + +You can find and use [hundreds of Sentence Transformers](https://huggingface.co/models?library=sentence-transformers&sort=downloads) models from the Hub by directly using the library, playing with the widgets in the browser or using the Inference API. + +## Task Variants + +### Passage Ranking + +Passage Ranking is the task of ranking documents based on their relevance to a given query. The task is evaluated on Mean Reciprocal Rank. These models take one query and multiple documents and return ranked documents according to the relevancy to the query. 📄 + +You can infer with Passage Ranking models using the [Inference API](https://huggingface.co/inference-api). The Passage Ranking model inputs are a query for which we look for relevancy in the documents and the documents we want to search. The model will return scores according to the relevancy of these documents for the query. + +```python +import json +import requests + +API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b" +headers = {"Authorization": f"Bearer {api_token}"} + +def query(payload): + response = requests.post(API_URL, headers=headers, json=payload) + return response.json() + +data = query( + { + "inputs": { + "source_sentence": "That is a happy person", + "sentences": [ + "That is a happy dog", + "That is a very happy person", + "Today is a sunny day" + ] + } + }) +## [0.853, 0.981, 0.655] +``` + +### Semantic Textual Similarity + +Semantic Textual Similarity is the task of evaluating how similar two texts are in terms of meaning. These models take a source sentence and a list of sentences in which we will look for similarities and will return a list of similarity scores. The benchmark dataset is the [Semantic Textual Similarity Benchmark](http://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
The task is evaluated on Pearson’s Rank Correlation. + +```python +import json +import requests + +API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2" +headers = {"Authorization": f"Bearer {api_token}"} + +def query(payload): + response = requests.post(API_URL, headers=headers, json=payload) + return response.json() + +data = query( + { + "inputs": { + "source_sentence": "I'm very happy", + "sentences":["I'm filled with happiness", "I'm happy"] + } + }) + +## [0.605, 0.894] +``` + +You can also infer with the models in the Hub using Sentence Transformer models. + +```python +pip install -U sentence-transformers + +from sentence_transformers import SentenceTransformer, util +sentences = ["I'm happy", "I'm full of happiness"] + +model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') + +#Compute embedding for both lists +embedding_1= model.encode(sentences[0], convert_to_tensor=True) +embedding_2 = model.encode(sentences[1], convert_to_tensor=True) + +util.pytorch_cos_sim(embedding_1, embedding_2) +## tensor([[0.6003]]) +``` + +## Useful Resources + +Would you like to learn more about Sentence Transformers and Sentence Similarity? Awesome! Here you can find some curated resources that you may find helpful! 
+ +- [Sentence Transformers Documentation](https://www.sbert.net/) +- [Sentence Transformers in the Hub](https://huggingface.co/blog/sentence-transformers-in-the-hub) +- [Building a Playlist Generator with Sentence Transformers](https://huggingface.co/blog/playlist-generator) +- [Getting Started With Embeddings](https://huggingface.co/blog/getting-started-with-embeddings) diff --git a/packages/tasks/src/sentence-similarity/data.ts b/packages/tasks/src/sentence-similarity/data.ts new file mode 100644 index 000000000..0f71b3946 --- /dev/null +++ b/packages/tasks/src/sentence-similarity/data.ts @@ -0,0 +1,101 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Bing queries with relevant passages from various web sources.", + id: "ms_marco", + }, + ], + demo: { + inputs: [ + { + label: "Source sentence", + content: "Machine learning is so easy.", + type: "text", + }, + { + label: "Sentences to compare to", + content: "Deep learning is so straightforward.", + type: "text", + }, + { + label: "", + content: "This is so difficult, like rocket science.", + type: "text", + }, + { + label: "", + content: "I can't believe how much I struggled with this.", + type: "text", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "Deep learning is so straightforward.", + score: 0.623, + }, + { + label: "This is so difficult, like rocket science.", + score: 0.413, + }, + { + label: "I can't believe how much I struggled with this.", + score: 0.256, + }, + ], + }, + ], + }, + metrics: [ + { + description: + "Reciprocal Rank is a measure used to rank the relevancy of documents given a set of documents. Reciprocal Rank is the reciprocal of the rank of the document retrieved, meaning, if the rank is 3, the Reciprocal Rank is 0.33. If the rank is 1, the Reciprocal Rank is 1", + id: "Mean Reciprocal Rank", + }, + { + description: + "The similarity of the embeddings is evaluated mainly on cosine similarity. 
It is calculated as the cosine of the angle between two vectors. It is particularly useful when your texts are not the same length", + id: "Cosine Similarity", + }, + ], + models: [ + { + description: + "This model works well for sentences and paragraphs and can be used for clustering/grouping and semantic searches.", + id: "sentence-transformers/all-mpnet-base-v2", + }, + { + description: "A multilingual model trained for FAQ retrieval.", + id: "clips/mfaq", + }, + ], + spaces: [ + { + description: "An application that leverages sentence similarity to answer questions from YouTube videos.", + id: "Gradio-Blocks/Ask_Questions_To_YouTube_Videos", + }, + { + description: + "An application that retrieves relevant PubMed abstracts for a given online article which can be used as further references.", + id: "Gradio-Blocks/pubmed-abstract-retriever", + }, + { + description: "An application that leverages sentence similarity to summarize text.", + id: "nickmuchi/article-text-summarizer", + }, + { + description: "A guide that explains how Sentence Transformers can be used for semantic search.", + id: "sentence-transformers/Sentence_Transformers_for_semantic_search", + }, + ], + summary: + "Sentence Similarity is the task of determining how similar two texts are. Sentence similarity models convert input texts into vectors (embeddings) that capture semantic information and calculate how close (similar) they are between them. 
This task is particularly useful for information retrieval and clustering/grouping.", + widgetModels: ["sentence-transformers/all-MiniLM-L6-v2"], + youtubeId: "VCZq5AkbNEU", +}; + +export default taskData; diff --git a/packages/tasks/src/summarization/about.md b/packages/tasks/src/summarization/about.md new file mode 100644 index 000000000..ec82c946f --- /dev/null +++ b/packages/tasks/src/summarization/about.md @@ -0,0 +1,58 @@ +## Use Cases + +### Research Paper Summarization 🧐 + +Research papers can be summarized to allow researchers to spend less time selecting which articles to read. There are several approaches you can take for a task like this: + +1. Use an existing extractive summarization model on the Hub to do inference. +2. Pick an existing language model trained for academic papers. This model can then be trained in a process called fine-tuning so it can solve the summarization task. +3. Use a sequence-to-sequence model like [T5](https://huggingface.co/docs/transformers/model_doc/t5) for abstractive text summarization. + +## Inference + +You can use the 🤗 Transformers library `summarization` pipeline to infer with existing Summarization models. If no model name is provided the pipeline will be initialized with [sshleifer/distilbart-cnn-12-6](https://huggingface.co/sshleifer/distilbart-cnn-12-6). + +```python +from transformers import pipeline + +classifier = pipeline("summarization") +classifier("Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018, in an area of more than 105 square kilometres (41 square miles). The City of Paris is the centre and seat of government of the region and province of Île-de-France, or Paris Region, which has an estimated population of 12,174,880, or about 18 percent of the population of France as of 2017.") +## [{ "summary_text": " Paris is the capital and most populous city of France..." 
}] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer summarization models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +const inputs = + "Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018, in an area of more than 105 square kilometres (41 square miles). The City of Paris is the centre and seat of government of the region and province of Île-de-France, or Paris Region, which has an estimated population of 12,174,880, or about 18 percent of the population of France as of 2017."; + +await inference.summarization({ + model: "sshleifer/distilbart-cnn-12-6", + inputs, +}); +``` + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful! + +- [Course Chapter on Summarization](https://huggingface.co/course/chapter7/5?fw=pt) +- [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq) + +### Notebooks + +- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/summarization.ipynb) +- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) +- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) + +### Documentation + +- [Summarization task guide](https://huggingface.co/docs/transformers/tasks/summarization) diff --git a/packages/tasks/src/summarization/data.ts b/packages/tasks/src/summarization/data.ts new file mode 100644 index 
000000000..d0afc8528 --- /dev/null +++ b/packages/tasks/src/summarization/data.ts @@ -0,0 +1,75 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: + "News articles in five different languages along with their summaries. Widely used for benchmarking multilingual summarization models.", + id: "mlsum", + }, + { + description: "English conversations and their summaries. Useful for benchmarking conversational agents.", + id: "samsum", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: + "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. It was the first structure to reach a height of 300 metres. Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.", + type: "text", + }, + ], + outputs: [ + { + label: "Output", + content: + "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. It was the first structure to reach a height of 300 metres.", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "The generated sequence is compared against its summary, and the overlap of tokens are counted. ROUGE-N refers to overlap of N subsequent tokens, ROUGE-1 refers to overlap of single tokens and ROUGE-2 is the overlap of two subsequent tokens.", + id: "rouge", + }, + ], + models: [ + { + description: + "A strong summarization model trained on English news articles. 
Excels at generating factual summaries.", + id: "facebook/bart-large-cnn", + }, + { + description: "A summarization model trained on medical articles.", + id: "google/bigbird-pegasus-large-pubmed", + }, + ], + spaces: [ + { + description: "An application that can summarize long paragraphs.", + id: "pszemraj/summarize-long-text", + }, + { + description: "A much needed summarization application for terms and conditions.", + id: "ml6team/distilbart-tos-summarizer-tosdr", + }, + { + description: "An application that summarizes long documents.", + id: "pszemraj/document-summarization", + }, + { + description: "An application that can detect errors in abstractive summarization.", + id: "ml6team/post-processing-summarization", + }, + ], + summary: + "Summarization is the task of producing a shorter version of a document while preserving its important information. Some models can extract text from the original input, while other models can generate entirely new text.", + widgetModels: ["sshleifer/distilbart-cnn-12-6"], + youtubeId: "yHnr5Dk2zCI", +}; + +export default taskData; diff --git a/packages/tasks/src/table-question-answering/about.md b/packages/tasks/src/table-question-answering/about.md new file mode 100644 index 000000000..684c85c22 --- /dev/null +++ b/packages/tasks/src/table-question-answering/about.md @@ -0,0 +1,43 @@ +## Use Cases + +### SQL execution + +You can use the Table Question Answering models to simulate SQL execution by inputting a table. + +### Table Question Answering + +Table Question Answering models are capable of answering questions based on a table. + +## Task Variants + +This place can be filled with variants of this task if there's any. + +## Inference + +You can infer with TableQA models using the 🤗 Transformers library. 
+ +```python +from transformers import pipeline +import pandas as pd + +# prepare table + question +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +table = pd.DataFrame.from_dict(data) +question = "how many movies does Leonardo Di Caprio have?" + +# pipeline model +# Note: you must install torch-scatter first. +tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetuned-wtq") + +# result + +print(tqa(table=table, query=question)['cells'][0]) +#53 + +``` + +## Useful Resources + +In this area, you can insert useful resources about how to train or use a model for this task. + +This task page is complete thanks to the efforts of [Hao Kim Tieu](https://huggingface.co/haotieu). 🦸 diff --git a/packages/tasks/src/table-question-answering/data.ts b/packages/tasks/src/table-question-answering/data.ts new file mode 100644 index 000000000..6ad9fd0af --- /dev/null +++ b/packages/tasks/src/table-question-answering/data.ts @@ -0,0 +1,59 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: + "The WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.", + id: "wikitablequestions", + }, + { + description: + "WikiSQL is a dataset of 80654 hand-annotated examples of questions and SQL queries distributed across 24241 tables from Wikipedia.", + id: "wikisql", + }, + ], + demo: { + inputs: [ + { + table: [ + ["Rank", "Name", "No.of reigns", "Combined days"], + ["1", "lou Thesz", "3", "3749"], + ["2", "Ric Flair", "8", "3103"], + ["3", "Harley Race", "7", "1799"], + ], + type: "tabular", + }, + + { label: "Question", content: "What is the number of reigns for Harley Race?", type: "text" }, + ], + outputs: [{ label: "Result", content: "7", type: "text" }], + }, + metrics: [ + { + description: "Checks whether the predicted answer(s) is the same as the ground-truth 
answer(s).", + id: "Denotation Accuracy", + }, + ], + models: [ + { + description: + "A table question answering model that is capable of neural SQL execution, i.e., employ TAPEX to execute a SQL query on a given table.", + id: "microsoft/tapex-base", + }, + { + description: "A robust table question answering model.", + id: "google/tapas-base-finetuned-wtq", + }, + ], + spaces: [ + { + description: "An application that answers questions based on table CSV files.", + id: "katanaml/table-query", + }, + ], + summary: "Table Question Answering (Table QA) is the answering a question about an information on a given table.", + widgetModels: ["google/tapas-base-finetuned-wtq"], +}; + +export default taskData; diff --git a/packages/tasks/src/tabular-classification/about.md b/packages/tasks/src/tabular-classification/about.md new file mode 100644 index 000000000..9af38bcee --- /dev/null +++ b/packages/tasks/src/tabular-classification/about.md @@ -0,0 +1,65 @@ +## About the Task + +Tabular classification is the task of assigning a label or class given a limited number of attributes. For example, the input can be data related to a customer (balance of the customer, the time being a customer, or more) and the output can be whether the customer will churn from the service or not. +There are three types of categorical variables: + +- Binary variables: Variables that can take two values, like yes or no, open or closed. The task of predicting binary variables is called binary classification. +- Ordinal variables: Variables with a ranking relationship, e.g., good, insignificant, and bad product reviews. The task of predicting ordinal variables is called ordinal classification. +- Nominal variables: Variables with no ranking relationship among them, e.g., predicting an animal from their weight and height, where categories are cat, dog, or bird. The task of predicting nominal variables is called multinomial classification. 
+ +## Use Cases + +### Fraud Detection +Tabular classification models can be used in detecting fraudulent credit card transactions, where the features could be the amount of the transaction and the account balance, and the target to predict could be whether the transaction is fraudulent or not. This is an example of binary classification. + +### Churn Prediction +Tabular classification models can be used in predicting customer churn in telecommunication. An example dataset for the task is hosted [here](https://huggingface.co/datasets/scikit-learn/churn-prediction). + +# Model Hosting and Inference + +You can use [skops](https://skops.readthedocs.io/) for model hosting and inference on the Hugging Face Hub. This library is built to improve production workflows of various libraries that are used to train tabular models, including [sklearn](https://scikit-learn.org/stable/) and [xgboost](https://xgboost.readthedocs.io/en/stable/). Using `skops` you can: + +- Easily use inference API, +- Build neat UIs with one line of code, +- Programmatically create model cards, +- Securely serialize your scikit-learn model. (See limitations of using pickle [here](https://huggingface.co/docs/hub/security-pickle).) + +You can push your model as follows: + +```python +from skops import hub_utils +# initialize a repository with a trained model +local_repo = "/path_to_new_repo" +hub_utils.init(model, dst=local_repo) +# push to Hub! +hub_utils.push("username/my-awesome-model", source=local_repo) +``` + +Once the model is pushed, you can infer easily. + +```python +import skops.hub_utils as hub_utils +import pandas as pd +data = pd.DataFrame(your_data) +# Load the model from the Hub +res = hub_utils.get_model_output("username/my-awesome-model", data) +``` + +You can launch a UI for your model with only one line of code! 
+ +```python +import gradio as gr +gr.Interface.load("huggingface/username/my-awesome-model").launch() +``` + +## Useful Resources + +- Check out the [scikit-learn organization](https://huggingface.co/scikit-learn) to learn more about different algorithms used for this task. +- [Skops documentation](https://skops.readthedocs.io/en/latest/) +- [Skops announcement blog](https://huggingface.co/blog/skops) +- [Notebook: Persisting your scikit-learn model using skops](https://www.kaggle.com/code/unofficialmerve/persisting-your-scikit-learn-model-using-skops) +- Check out [interactive sklearn examples](https://huggingface.co/sklearn-docs) built with ❤️ using Gradio. + +### Training your own model in just a few seconds + +We have built a [baseline trainer](https://huggingface.co/spaces/scikit-learn/baseline-trainer) application to which you can drag and drop your dataset. It will train a baseline and push it to your Hugging Face Hub profile with a model card containing information about the model. 
diff --git a/packages/tasks/src/tabular-classification/data.ts b/packages/tasks/src/tabular-classification/data.ts new file mode 100644 index 000000000..5cc3f4b9a --- /dev/null +++ b/packages/tasks/src/tabular-classification/data.ts @@ -0,0 +1,68 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A comprehensive curation of datasets covering all benchmarks.", + id: "inria-soda/tabular-benchmark", + }, + ], + demo: { + inputs: [ + { + table: [ + ["Glucose", "Blood Pressure ", "Skin Thickness", "Insulin", "BMI"], + ["148", "72", "35", "0", "33.6"], + ["150", "50", "30", "0", "35.1"], + ["141", "60", "29", "1", "39.2"], + ], + type: "tabular", + }, + ], + outputs: [ + { + table: [["Diabetes"], ["1"], ["1"], ["0"]], + type: "tabular", + }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: "", + id: "f1", + }, + ], + models: [ + { + description: "Breast cancer prediction model based on decision trees.", + id: "scikit-learn/cancer-prediction-trees", + }, + ], + spaces: [ + { + description: "An application that can predict defective products on a production line.", + id: "scikit-learn/tabular-playground", + }, + { + description: "An application that compares various tabular classification techniques on different datasets.", + id: "scikit-learn/classification", + }, + ], + summary: "Tabular classification is the task of classifying a target category (a group) based on set of attributes.", + widgetModels: ["scikit-learn/tabular-playground"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/tabular-regression/about.md b/packages/tasks/src/tabular-regression/about.md new file mode 100644 index 000000000..eb8861779 --- /dev/null +++ b/packages/tasks/src/tabular-regression/about.md @@ -0,0 +1,87 @@ +## About the Task + +Tabular regression is the 
task of predicting a numerical value given a set of attributes/features. _Tabular_ means that data is stored in a table (like an Excel sheet), and each sample is contained in its own row. The features used to predict our target can be both numerical and categorical. However, including categorical features often requires additional preprocessing/feature engineering (a few models do accept categorical features directly, like [CatBoost](https://catboost.ai/)). An example of tabular regression would be predicting the weight of a fish given its species and length. + +## Use Cases + +### Sales Prediction: a Use Case for Predicting a Continuous Target Variable + +Here the objective is to predict a continuous variable based on a set of input variable(s). For example, predicting `sales` of an ice cream shop based on the `temperature` of the weather and the `duration of hours` the shop was open. Here we can build a regression model with `temperature` and `duration of hours` as input variables and `sales` as the target variable. + +### Missing Value Imputation for Other Tabular Tasks +In real-world applications, due to human error or other reasons, some of the input values can be missing or there might not be any recorded data. Considering the example above, say the shopkeeper's watch was broken and they forgot to calculate the `hours` for which the shop was open. This will lead to a missing value in their dataset. In this case, missing values could be replaced with zero, or with the average hours for which the shop is kept open. Another approach we can try is to use `temperature` and `sales` variables to predict the `hours` variable here. 
+ +## Model Training + +A simple regression model can be created using `sklearn` as follows: + +```python +#set the input features +X = data[["Feature 1", "Feature 2", "Feature 3"]] +#set the target variable +y = data["Target Variable"] +#initialize the model +model = LinearRegression() +#Fit the model +model.fit(X, y) +``` + +# Model Hosting and Inference + +You can use [skops](https://skops.readthedocs.io/) for model hosting and inference on the Hugging Face Hub. This library is built to improve production workflows of various libraries that are used to train tabular models, including [sklearn](https://scikit-learn.org/stable/) and [xgboost](https://xgboost.readthedocs.io/en/stable/). Using `skops` you can: + +- Easily use inference API, +- Build neat UIs with one line of code, +- Programmatically create model cards, +- Securely serialize your models. (See limitations of using pickle [here](https://huggingface.co/docs/hub/security-pickle).) + +You can push your model as follows: + +```python +from skops import hub_utils +# initialize a repository with a trained model +local_repo = "/path_to_new_repo" +hub_utils.init(model, dst=local_repo) +# push to Hub! +hub_utils.push("username/my-awesome-model", source=local_repo) +``` + +Once the model is pushed, you can infer easily. + +```python +import skops.hub_utils as hub_utils +import pandas as pd +data = pd.DataFrame(your_data) +# Load the model from the Hub +res = hub_utils.get_model_output("username/my-awesome-model", data) +``` + +You can launch a UI for your model with only one line of code! + +```python +import gradio as gr +gr.Interface.load("huggingface/username/my-awesome-model").launch() +``` + +## Useful Resources + +- [Skops documentation](https://skops.readthedocs.io/en/stable/index.html) + +- Check out [interactive sklearn examples](https://huggingface.co/sklearn-docs) built with ❤️ using Gradio. 
+- [Notebook: Persisting your scikit-learn model using skops](https://www.kaggle.com/code/unofficialmerve/persisting-your-scikit-learn-model-using-skops) + +- For starting with tabular regression: + + - Doing [Exploratory Data Analysis](https://neptune.ai/blog/exploratory-data-analysis-for-tabular-data) for tabular data. + - The data considered here consists of details of Olympic athletes and medal results from Athens 1896 to Rio 2016. + - Here you can learn more about how to explore and analyse the data and visualize them in order to get a better understanding of dataset. + - Building your [first ML model](https://www.kaggle.com/code/dansbecker/your-first-machine-learning-model). + +- Intermediate level tutorials on tabular regression: + - [A Short Chronology of Deep Learning for Tabular Data](https://sebastianraschka.com/blog/2022/deep-learning-for-tabular-data.html) by Sebastian Raschka. + +### Training your own model in just a few seconds + +We have built a [baseline trainer](https://huggingface.co/spaces/scikit-learn/baseline-trainer) application to which you can drag and drop your dataset. It will train a baseline and push it to your Hugging Face Hub profile with a model card containing information about the model. + +This page was made possible thanks to efforts of [Brenden Connors](https://huggingface.co/brendenc) and [Ayush Bihani](https://huggingface.co/hsuyab). 
diff --git a/packages/tasks/src/tabular-regression/data.ts b/packages/tasks/src/tabular-regression/data.ts new file mode 100644 index 000000000..c5b787538 --- /dev/null +++ b/packages/tasks/src/tabular-regression/data.ts @@ -0,0 +1,57 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A comprehensive curation of datasets covering all benchmarks.", + id: "inria-soda/tabular-benchmark", + }, + ], + demo: { + inputs: [ + { + table: [ + ["Car Name", "Horsepower", "Weight"], + ["ford torino", "140", "3,449"], + ["amc hornet", "97", "2,774"], + ["toyota corolla", "65", "1,773"], + ], + type: "tabular", + }, + ], + outputs: [ + { + table: [["MPG (miles per gallon)"], ["17"], ["18"], ["31"]], + type: "tabular", + }, + ], + }, + metrics: [ + { + description: "", + id: "mse", + }, + { + description: + "Coefficient of determination (or R-squared) is a measure of how well the model fits the data. Higher R-squared is considered a better fit.", + id: "r-squared", + }, + ], + models: [ + { + description: "Fish weight prediction based on length measurements and species.", + id: "scikit-learn/Fish-Weight", + }, + ], + spaces: [ + { + description: "An application that can predict weight of a fish based on set of attributes.", + id: "scikit-learn/fish-weight-prediction", + }, + ], + summary: "Tabular regression is the task of predicting a numerical value given a set of attributes.", + widgetModels: ["scikit-learn/Fish-Weight"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/tasksData.ts b/packages/tasks/src/tasksData.ts new file mode 100644 index 000000000..db2609971 --- /dev/null +++ b/packages/tasks/src/tasksData.ts @@ -0,0 +1,101 @@ +import { type PipelineType, PIPELINE_DATA } from "./pipelines"; +import type { TaskDataCustom, TaskData } from "./Types"; + +import audioClassification from "./audio-classification/data"; +import audioToAudio from "./audio-to-audio/data"; +import 
automaticSpeechRecognition from "./automatic-speech-recognition/data"; +import conversational from "./conversational/data"; +import documentQuestionAnswering from "./document-question-answering/data"; +import featureExtraction from "./feature-extraction/data"; +import fillMask from "./fill-mask/data"; +import imageClassification from "./image-classification/data"; +import imageToImage from "./image-to-image/data"; +import imageToText from "./image-to-text/data"; +import imageSegmentation from "./image-segmentation/data"; +import objectDetection from "./object-detection/data"; +import depthEstimation from "./depth-estimation/data"; +import placeholder from "./placeholder/data"; +import reinforcementLearning from "./reinforcement-learning/data"; +import questionAnswering from "./question-answering/data"; +import sentenceSimilarity from "./sentence-similarity/data"; +import summarization from "./summarization/data"; +import tableQuestionAnswering from "./table-question-answering/data"; +import tabularClassification from "./tabular-classification/data"; +import tabularRegression from "./tabular-regression/data"; +import textToImage from "./text-to-image/data"; +import textToSpeech from "./text-to-speech/data"; +import tokenClassification from "./token-classification/data"; +import translation from "./translation/data"; +import textClassification from "./text-classification/data"; +import textGeneration from "./text-generation/data"; +import textToVideo from "./text-to-video/data"; +import unconditionalImageGeneration from "./unconditional-image-generation/data"; +import videoClassification from "./video-classification/data"; +import visualQuestionAnswering from "./visual-question-answering/data"; +import zeroShotClassification from "./zero-shot-classification/data"; +import zeroShotImageClassification from "./zero-shot-image-classification/data"; +import { TASKS_MODEL_LIBRARIES } from "./const"; + +// To make comparisons easier, task order is the same as in const.ts 
+// Tasks set to undefined won't have an associated task page. +// Tasks that call getData() without the second argument will +// have a "placeholder" page. +export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = { + "audio-classification": getData("audio-classification", audioClassification), + "audio-to-audio": getData("audio-to-audio", audioToAudio), + "automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition), + conversational: getData("conversational", conversational), + "depth-estimation": getData("depth-estimation", depthEstimation), + "document-question-answering": getData("document-question-answering", documentQuestionAnswering), + "feature-extraction": getData("feature-extraction", featureExtraction), + "fill-mask": getData("fill-mask", fillMask), + "graph-ml": undefined, + "image-classification": getData("image-classification", imageClassification), + "image-segmentation": getData("image-segmentation", imageSegmentation), + "image-to-image": getData("image-to-image", imageToImage), + "image-to-text": getData("image-to-text", imageToText), + "multiple-choice": undefined, + "object-detection": getData("object-detection", objectDetection), + "video-classification": getData("video-classification", videoClassification), + other: undefined, + "question-answering": getData("question-answering", questionAnswering), + "reinforcement-learning": getData("reinforcement-learning", reinforcementLearning), + robotics: undefined, + "sentence-similarity": getData("sentence-similarity", sentenceSimilarity), + summarization: getData("summarization", summarization), + "table-question-answering": getData("table-question-answering", tableQuestionAnswering), + "table-to-text": undefined, + "tabular-classification": getData("tabular-classification", tabularClassification), + "tabular-regression": getData("tabular-regression", tabularRegression), + "tabular-to-text": undefined, + "text-classification": getData("text-classification", textClassification), + 
"text-generation": getData("text-generation", textGeneration), + "text-retrieval": undefined, + "text-to-image": getData("text-to-image", textToImage), + "text-to-speech": getData("text-to-speech", textToSpeech), + "text-to-audio": undefined, + "text-to-video": getData("text-to-video", textToVideo), + "text2text-generation": undefined, + "time-series-forecasting": undefined, + "token-classification": getData("token-classification", tokenClassification), + translation: getData("translation", translation), + "unconditional-image-generation": getData("unconditional-image-generation", unconditionalImageGeneration), + "visual-question-answering": getData("visual-question-answering", visualQuestionAnswering), + "voice-activity-detection": undefined, + "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification), + "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification), +} as const; + +/** + * Return the whole TaskData object for a certain task. + * If the partialTaskData argument is left undefined, + * the default placeholder data will be used. + */ +function getData(type: PipelineType, partialTaskData: TaskDataCustom = placeholder): TaskData { + return { + ...partialTaskData, + id: type, + label: PIPELINE_DATA[type].name, + libraries: TASKS_MODEL_LIBRARIES[type], + }; +} diff --git a/packages/tasks/src/text-classification/about.md b/packages/tasks/src/text-classification/about.md new file mode 100644 index 000000000..448eb7138 --- /dev/null +++ b/packages/tasks/src/text-classification/about.md @@ -0,0 +1,172 @@ +## Use Cases + +### Sentiment Analysis on Customer Reviews + +You can track the sentiments of your customers from the product reviews using sentiment analysis models. This can help understand churn and retention by grouping reviews by sentiment, to later analyze the text and make strategic decisions based on this knowledge. 
+ +## Task Variants + +### Natural Language Inference (NLI) + +In NLI the model determines the relationship between two given texts. Concretely, the model takes a premise and a hypothesis and returns a class that can either be: + +- **entailment**, which means the hypothesis is true. +- **contradiction**, which means the hypothesis is false. +- **neutral**, which means there's no relation between the hypothesis and the premise. + +The benchmark dataset for this task is GLUE (General Language Understanding Evaluation). NLI models have different variants, such as Multi-Genre NLI, Question NLI and Winograd NLI. + +### Multi-Genre NLI (MNLI) + +MNLI is used for general NLI. Here are some examples: + +``` +Example 1: + Premise: A man inspects the uniform of a figure in some East Asian country. + Hypothesis: The man is sleeping. + Label: Contradiction + +Example 2: + Premise: Soccer game with multiple males playing. + Hypothesis: Some men are playing a sport. + Label: Entailment +``` + +#### Inference + +You can use the 🤗 Transformers library `text-classification` pipeline to infer with NLI models. + +```python +from transformers import pipeline + +classifier = pipeline("text-classification", model = "roberta-large-mnli") +classifier("A soccer game with multiple males playing. Some men are playing a sport.") +## [{'label': 'ENTAILMENT', 'score': 0.98}] +``` + +### Question Natural Language Inference (QNLI) + +QNLI is the task of determining if the answer to a certain question can be found in a given document. If the answer can be found the label is “entailment”. If the answer cannot be found the label is “not entailment”. + +``` +Question: What percentage of marine life died during the extinction? +Sentence: It is also known as the “Great Dying” because it is considered the largest mass extinction in the Earth’s history. +Label: not entailment + +Question: Who was the London Weekend Television’s Managing Director? 
+Sentence: The managing director of London Weekend Television (LWT), Greg Dyke, met with the representatives of the "big five" football clubs in England in 1990. +Label: entailment +``` + +#### Inference + +You can use the 🤗 Transformers library `text-classification` pipeline to infer with QNLI models. The model returns the label and the confidence. + +```python +from transformers import pipeline + +classifier = pipeline("text-classification", model = "cross-encoder/qnli-electra-base") +classifier("Where is the capital of France?, Paris is the capital of France.") +## [{'label': 'entailment', 'score': 0.997}] +``` + +### Sentiment Analysis + +In Sentiment Analysis, the classes can be polarities like positive, negative, neutral, or sentiments such as happiness or anger. + +#### Inference + +You can use the 🤗 Transformers library with the `sentiment-analysis` pipeline to infer with Sentiment Analysis models. The model returns the label with the score. + +```python +from transformers import pipeline + +classifier = pipeline("sentiment-analysis") +classifier("I loved Star Wars so much!") +## [{'label': 'POSITIVE', 'score': 0.99}] +``` + +### Quora Question Pairs + +Quora Question Pairs models assess whether two provided questions are paraphrases of each other. The model takes two questions and returns a binary value, with 0 being mapped to “not paraphrase” and 1 to “paraphrase”. The benchmark dataset is [Quora Question Pairs](https://huggingface.co/datasets/glue/viewer/qqp/test) inside the [GLUE benchmark](https://huggingface.co/datasets/glue). The dataset consists of question pairs and their labels. + +``` +Question1: “How can I increase the speed of my internet connection while using a VPN?” +Question2: “How can Internet speed be increased by hacking through DNS?” 
+Label: Not paraphrase + +Question1: “What can make Physics easy to learn?” +Question2: “How can you make physics easy to learn?” +Label: Paraphrase +``` + +#### Inference + +You can use the 🤗 Transformers library `text-classification` pipeline to infer with QQP models. + +```python +from transformers import pipeline + +classifier = pipeline("text-classification", model = "textattack/bert-base-uncased-QQP") +classifier("Which city is the capital of France?, Where is the capital of France?") +## [{'label': 'paraphrase', 'score': 0.998}] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer text classification models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.textClassification({ + model: "distilbert-base-uncased-finetuned-sst-2-english", + inputs: "I love this movie!", +}); +``` + +### Grammatical Correctness + +Linguistic Acceptability is the task of assessing the grammatical acceptability of a sentence. The classes in this task are “acceptable” and “unacceptable”. The benchmark dataset used for this task is [Corpus of Linguistic Acceptability (CoLA)](https://huggingface.co/datasets/glue/viewer/cola/test). The dataset consists of texts and their labels. + +``` +Example: Books were sent to each other by the students. +Label: Unacceptable + +Example: She voted for herself. +Label: Acceptable +``` + +#### Inference + +```python +from transformers import pipeline + +classifier = pipeline("text-classification", model = "textattack/distilbert-base-uncased-CoLA") +classifier("I will walk to home when I went through the bus.") +## [{'label': 'unacceptable', 'score': 0.95}] +``` + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful! 
+ +- [Course Chapter on Fine-tuning a Text Classification Model](https://huggingface.co/course/chapter3/1?fw=pt) +- [Getting Started with Sentiment Analysis using Python](https://huggingface.co/blog/sentiment-analysis-python) +- [Sentiment Analysis on Encrypted Data with Homomorphic Encryption](https://huggingface.co/blog/sentiment-analysis-fhe) +- [Leveraging Hugging Face for complex text classification use cases](https://huggingface.co/blog/classification-use-cases) + +### Notebooks + +- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb) +- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb) +- [Flax](https://github.com/huggingface/notebooks/blob/master/examples/text_classification_flax.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) +- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) + +### Documentation + +- [Text classification task guide](https://huggingface.co/docs/transformers/tasks/sequence_classification) diff --git a/packages/tasks/src/text-classification/data.ts b/packages/tasks/src/text-classification/data.ts new file mode 100644 index 000000000..7893753ca --- /dev/null +++ b/packages/tasks/src/text-classification/data.ts @@ -0,0 +1,91 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A widely used dataset used to benchmark multiple variants of text classification.", + id: "glue", + }, + { + description: "A text classification dataset used to benchmark natural language inference models", + id: "snli", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "I love Hugging Face!", + type: "text", + }, + ], + outputs: [ + { + 
type: "chart", + data: [ + { + label: "POSITIVE", + score: 0.9, + }, + { + label: "NEUTRAL", + score: 0.1, + }, + { + label: "NEGATIVE", + score: 0.0, + }, + ], + }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: + "The F1 metric is the harmonic mean of the precision and recall. It can be calculated as: F1 = 2 * (precision * recall) / (precision + recall)", + id: "f1", + }, + ], + models: [ + { + description: "A robust model trained for sentiment analysis.", + id: "distilbert-base-uncased-finetuned-sst-2-english", + }, + { + description: "Multi-genre natural language inference model.", + id: "roberta-large-mnli", + }, + ], + spaces: [ + { + description: "An application that can classify financial sentiment.", + id: "IoannisTr/Tech_Stocks_Trading_Assistant", + }, + { + description: "A dashboard that contains various text classification tasks.", + id: "miesnerjacob/Multi-task-NLP", + }, + { + description: "An application that analyzes user reviews in healthcare.", + id: "spacy/healthsea-demo", + }, + ], + summary: + "Text Classification is the task of assigning a label or class to a given text. Some use cases are sentiment analysis, natural language inference, and assessing grammatical correctness.", + widgetModels: ["distilbert-base-uncased-finetuned-sst-2-english"], + youtubeId: "leNG9fN9FQU", +}; + +export default taskData; diff --git a/packages/tasks/src/text-generation/about.md b/packages/tasks/src/text-generation/about.md new file mode 100644 index 000000000..c8ed8120f --- /dev/null +++ b/packages/tasks/src/text-generation/about.md @@ -0,0 +1,129 @@ +This task covers guides on both [text-generation](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads) and [text-to-text generation](https://huggingface.co/models?pipeline_tag=text2text-generation&sort=downloads) models. 
Popular large language models that are used for chats or following instructions are also covered in this task. You can find the list of selected open-source large language models [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), ranked by their performance scores. + +## Use Cases + +### Instruction Models + +A model trained for text generation can be later adapted to follow instructions. One of the most used open-source models for instruction is OpenAssistant, which you can try [at Hugging Chat](https://huggingface.co/chat). + +### Code Generation + +A Text Generation model, also known as a causal language model, can be trained on code from scratch to help the programmers in their repetitive coding tasks. One of the most popular open-source models for code generation is StarCoder, which can generate code in 80+ languages. You can try it [here](https://huggingface.co/spaces/bigcode/bigcode-playground). + +### Stories Generation + +A story generation model can receive an input like "Once upon a time" and proceed to create a story-like text based on those first words. You can try [this application](https://huggingface.co/spaces/mosaicml/mpt-7b-storywriter) which contains a model trained on story generation, by MosaicML. + +If your generative model training data is different than your use case, you can train a causal language model from scratch. Learn how to do it in the free transformers [course](https://huggingface.co/course/chapter7/6?fw=pt)! + +## Task Variants + +### Completion Generation Models + +A popular variant of Text Generation models predicts the next word given a bunch of words. Word by word a longer text is formed that results in for example: + +- Given an incomplete sentence, complete it. +- Continue a story given the first sentences. +- Provided a code description, generate the code. + +The most popular models for this task are GPT-based models or [Llama series](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). 
These models are trained on data that has no labels, so you just need plain text to train your own model. You can train text generation models to generate a wide variety of documents, from code to stories. + +### Text-to-Text Generation Models + +These models are trained to learn the mapping between a pair of texts (e.g. translation from one language to another). The most popular variants of these models are [FLAN-T5](https://huggingface.co/google/flan-t5-xxl), and [BART](https://huggingface.co/docs/transformers/model_doc/bart). Text-to-Text models are trained with multi-tasking capabilities, they can accomplish a wide range of tasks, including summarization, translation, and text classification. + +## Inference + +You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed. + +```python +from transformers import pipeline +generator = pipeline('text-generation', model = 'gpt2') +generator("Hello, I'm a language model", max_length = 30, num_return_sequences=3) +## [{'generated_text': "Hello, I'm a language modeler. So while writing this, when I went out to meet my wife or come home she told me that my"}, +## {'generated_text': "Hello, I'm a language modeler. I write and maintain software in Python. I love to code, and that includes coding things that require writing"}, ... +``` + +[Text-to-Text generation models](https://huggingface.co/models?pipeline_tag=text2text-generation&sort=downloads) have a separate pipeline called `text2text-generation`. This pipeline takes an input containing the sentence including the task and returns the output of the accomplished task. + +```python +from transformers import pipeline + +text2text_generator = pipeline("text2text-generation") +text2text_generator("question: What is 42 ? 
context: 42 is the answer to life, the universe and everything") +[{'generated_text': 'the answer to life, the universe and everything'}] + +text2text_generator("translate from English to French: I'm very happy") +[{'generated_text': 'Je suis très heureux'}] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer text generation models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.textGeneration({ + model: "gpt2", + inputs: "Hello, I'm a language model", +}); +``` + +## Text Generation Inference + +[Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) is an open-source toolkit for serving LLMs tackling challenges such as response time. TGI powers inference solutions like [Inference Endpoints](https://huggingface.co/inference-endpoints) and [Hugging Chat](https://huggingface.co/chat/), as well as multiple community projects. You can use it to deploy any supported open-source large language model of your choice. + +## ChatUI Spaces + +Hugging Face Spaces includes templates to easily deploy your own instance of a specific application. [ChatUI](https://github.com/huggingface/chat-ui) is an open-source interface that enables serving a conversational interface for large language models and can be deployed with a few clicks on Spaces. TGI powers these Spaces under the hood for faster inference. Thanks to the template, you can deploy your own instance based on a large language model with only a few clicks and customize it. Learn more about it [here](https://huggingface.co/docs/hub/spaces-sdks-docker-chatui) and create your large language model instance [here](https://huggingface.co/new-space?template=huggingchat/chat-ui-template).
+ +![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/os_llms/docker_chat.png) + +## Useful Resources + +Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful! + +### Tools within Hugging Face Ecosystem + +- You can use [PEFT](https://github.com/huggingface/peft) to adapt large language models in an efficient way. +- [ChatUI](https://github.com/huggingface/chat-ui) is the open-source interface to converse with Large Language Models. +- [text-generation-inference](https://github.com/huggingface/text-generation-inference) +- [HuggingChat](https://huggingface.co/chat/) is a chat interface powered by Hugging Face to chat with powerful models like Llama 2 70B. + +### Documentation + +- [PEFT documentation](https://huggingface.co/docs/peft/index) +- [ChatUI Docker Spaces](https://huggingface.co/docs/hub/spaces-sdks-docker-chatui) +- [Causal language modeling task guide](https://huggingface.co/docs/transformers/tasks/language_modeling) +- [Text generation strategies](https://huggingface.co/docs/transformers/generation_strategies) + +### Course and Blogs + +- [Course Chapter on Training a causal language model from scratch](https://huggingface.co/course/chapter7/6?fw=pt) +- [T0 Discussion with Victor Sanh](https://www.youtube.com/watch?v=Oy49SCW_Xpw&ab_channel=HuggingFace) +- [Hugging Face Course Workshops: Pretraining Language Models & CodeParrot](https://www.youtube.com/watch?v=ExUR7w6xe94&ab_channel=HuggingFace) +- [Training CodeParrot 🦜 from Scratch](https://huggingface.co/blog/codeparrot) +- [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) +- [Guiding Text Generation with Constrained Beam Search in 🤗 Transformers](https://huggingface.co/blog/constrained-beam-search) +- [Code generation with Hugging Face](https://huggingface.co/spaces/codeparrot/code-generation-models)
+- [🌸 Introducing The World's Largest Open Multilingual Language Model: BLOOM 🌸](https://huggingface.co/blog/bloom) +- [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed) +- [Faster Text Generation with TensorFlow and XLA](https://huggingface.co/blog/tf-xla-generate) +- [Assisted Generation: a new direction toward low-latency text generation](https://huggingface.co/blog/assisted-generation) +- [Introducing RWKV - An RNN with the advantages of a transformer](https://huggingface.co/blog/rwkv) +- [Creating a Coding Assistant with StarCoder](https://huggingface.co/blog/starchat-alpha) +- [StarCoder: A State-of-the-Art LLM for Code](https://huggingface.co/blog/starcoder) +- [Open-Source Text Generation & LLM Ecosystem at Hugging Face](https://huggingface.co/blog/os-llms) +- [Llama 2 is at Hugging Face](https://huggingface.co/blog/llama2) + +### Notebooks + +- [Training a CLM in Flax](https://github.com/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb) +- [Training a CLM in TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb) +- [Training a CLM in PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb) + +### Scripts for training + +- [Training a CLM in PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) +- [Training a CLM in TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling) +- [Text Generation in PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation) diff --git a/packages/tasks/src/text-generation/data.ts b/packages/tasks/src/text-generation/data.ts new file mode 100644 index 000000000..15b83ef0a --- /dev/null +++ b/packages/tasks/src/text-generation/data.ts @@ -0,0 +1,126 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: 
TaskDataCustom = { + datasets: [ + { + description: "A large multilingual dataset of text crawled from the web.", + id: "mc4", + }, + { + description: + "Diverse open-source data consisting of 22 smaller high-quality datasets. It was used to train GPT-Neo.", + id: "the_pile", + }, + { + description: "A crowd-sourced instruction dataset to develop an AI assistant.", + id: "OpenAssistant/oasst1", + }, + { + description: "A crowd-sourced instruction dataset created by Databricks employees.", + id: "databricks/databricks-dolly-15k", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "Once upon a time,", + type: "text", + }, + ], + outputs: [ + { + label: "Output", + content: + "Once upon a time, we knew that our ancestors were on the verge of extinction. The great explorers and poets of the Old World, from Alexander the Great to Chaucer, are dead and gone. A good many of our ancient explorers and poets have", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "Cross Entropy is a metric that calculates the difference between two probability distributions. Each probability distribution is the distribution of predicted words", + id: "Cross Entropy", + }, + { + description: + "The Perplexity metric is the exponential of the cross-entropy loss. It evaluates the probabilities assigned to the next word by the model. 
Lower perplexity indicates better performance", + id: "Perplexity", + }, + ], + models: [ + { + description: "A large language model trained for text generation.", + id: "bigscience/bloom-560m", + }, + { + description: "A large code generation model that can generate code in 80+ languages.", + id: "bigcode/starcoder", + }, + { + description: "A model trained to follow instructions, uses Pythia-12b as base model.", + id: "databricks/dolly-v2-12b", + }, + { + description: "A model trained to follow instructions curated by community, uses Pythia-12b as base model.", + id: "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5", + }, + { + description: "A large language model trained to generate text in English.", + id: "stabilityai/stablelm-tuned-alpha-7b", + }, + { + description: "A model trained to follow instructions, based on mosaicml/mpt-7b.", + id: "mosaicml/mpt-7b-instruct", + }, + { + description: "A large language model trained to generate text in English.", + id: "EleutherAI/pythia-12b", + }, + { + description: "A large text-to-text model trained to follow instructions.", + id: "google/flan-ul2", + }, + { + description: "A large and powerful text generation model.", + id: "tiiuae/falcon-40b", + }, + { + description: "State-of-the-art open-source large language model.", + id: "meta-llama/Llama-2-70b-hf", + }, + ], + spaces: [ + { + description: "A robust text generation model that can perform various tasks through natural language prompting.", + id: "huggingface/bloom_demo", + }, + { + description: "An text generation based application that can write code for 80+ languages.", + id: "bigcode/bigcode-playground", + }, + { + description: "An text generation based application for conversations.", + id: "h2oai/h2ogpt-chatbot", + }, + { + description: "An text generation application that combines OpenAI and Hugging Face models.", + id: "microsoft/HuggingGPT", + }, + { + description: "An text generation application that uses StableLM-tuned-alpha-7b.", + id: 
"stabilityai/stablelm-tuned-alpha-chat", + }, + { + description: "An UI that uses StableLM-tuned-alpha-7b.", + id: "togethercomputer/OpenChatKit", + }, + ], + summary: + "Generating text is the task of producing new text. These models can, for example, fill in incomplete text or paraphrase.", + widgetModels: ["tiiuae/falcon-7b-instruct"], + youtubeId: "Vpjb1lu0MDk", +}; + +export default taskData; diff --git a/packages/tasks/src/text-to-image/about.md b/packages/tasks/src/text-to-image/about.md new file mode 100644 index 000000000..e7c79fb43 --- /dev/null +++ b/packages/tasks/src/text-to-image/about.md @@ -0,0 +1,66 @@ +## Use Cases + +### Data Generation + +Businesses can generate data for their their use cases by inputting text and getting image outputs. + +### Immersive Conversational Chatbots + +Chatbots can be made more immersive if they provide contextual images based on the input provided by the user. + +### Creative Ideas for Fashion Industry + +Different patterns can be generated to obtain unique pieces of fashion. Text-to-image models make creations easier for designers to conceptualize their design before actually implementing it. + +### Architecture Industry + +Architects can utilise the models to construct an environment based out on the requirements of the floor plan. This can also include the furniture that has to be placed in that environment. + +## Task Variants + +You can contribute variants of this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/text-to-image/about.md). + +## Inference + +You can use diffusers pipelines to infer with `text-to-image` models. 
+ +```python +from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler + +model_id = "stabilityai/stable-diffusion-2" +scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler") +pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer text-to-image models on Hugging Face Hub. + +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.textToImage({ + model: "stabilityai/stable-diffusion-2", + inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", + parameters: { + negative_prompt: "blurry", + }, +}); +``` + +## Useful Resources + +- [Hugging Face Diffusion Models Course](https://github.com/huggingface/diffusion-models-class) +- [Getting Started with Diffusers](https://huggingface.co/docs/diffusers/index) +- [Text-to-Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/conditional_image_generation) +- [MinImagen - Build Your Own Imagen Text-to-Image Model](https://www.assemblyai.com/blog/minimagen-build-your-own-imagen-text-to-image-model/) +- [Using LoRA for Efficient Stable Diffusion Fine-Tuning](https://huggingface.co/blog/lora) +- [Using Stable Diffusion with Core ML on Apple Silicon](https://huggingface.co/blog/diffusers-coreml) +- [A guide on Vector Quantized Diffusion](https://huggingface.co/blog/vq-diffusion) +- [🧨 Stable Diffusion in JAX/Flax](https://huggingface.co/blog/stable_diffusion_jax) +- [Running IF with 🧨 diffusers on a Free Tier Google Colab](https://huggingface.co/blog/if) + +This page was made possible thanks to the efforts of [Ishan Dutta](https://huggingface.co/ishandutta), [Enrique Elias 
Ubaldo](https://huggingface.co/herrius) and [Oğuz Akif](https://huggingface.co/oguzakif). diff --git a/packages/tasks/src/text-to-image/data.ts b/packages/tasks/src/text-to-image/data.ts new file mode 100644 index 000000000..0a6c41ac1 --- /dev/null +++ b/packages/tasks/src/text-to-image/data.ts @@ -0,0 +1,94 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "RedCaps is a large-scale dataset of 12M image-text pairs collected from Reddit.", + id: "red_caps", + }, + { + description: "Conceptual Captions is a dataset consisting of ~3.3M images annotated with captions.", + id: "conceptual_captions", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "A city above clouds, pastel colors, Victorian style", + type: "text", + }, + ], + outputs: [ + { + filename: "image.jpeg", + type: "img", + }, + ], + }, + metrics: [ + { + description: + "The Inception Score (IS) measure assesses diversity and meaningfulness. It uses a generated image sample to predict its label. A higher score signifies more diverse and meaningful images.", + id: "IS", + }, + { + description: + "The Fréchet Inception Distance (FID) calculates the distance between distributions between synthetic and real samples. A lower FID score indicates better similarity between the distributions of real and generated images.", + id: "FID", + }, + { + description: + "R-precision assesses how the generated image aligns with the provided text description. It uses the generated images as queries to retrieve relevant text descriptions. The top 'r' relevant descriptions are selected and used to calculate R-precision as r/R, where 'R' is the number of ground truth descriptions associated with the generated images. 
A higher R-precision value indicates a better model.", + id: "R-Precision", + }, + ], + models: [ + { + description: + "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.", + id: "CompVis/stable-diffusion-v1-4", + }, + { + description: + "A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.", + id: "dalle-mini/dalle-mega", + }, + { + description: "A text-to-image model that can generate coherent text inside image.", + id: "DeepFloyd/IF-I-XL-v1.0", + }, + { + description: "A powerful text-to-image model.", + id: "kakaobrain/karlo-v1-alpha", + }, + ], + spaces: [ + { + description: "A powerful text-to-image application.", + id: "stabilityai/stable-diffusion", + }, + { + description: "An text-to-image application that can generate coherent text inside the image.", + id: "DeepFloyd/IF", + }, + { + description: "An powerful text-to-image application that can generate images.", + id: "kakaobrain/karlo", + }, + { + description: "An powerful text-to-image application that can generates 3D representations.", + id: "hysts/Shap-E", + }, + { + description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.", + id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI", + }, + ], + summary: + "Generates images from input text. These models can be used to generate and modify images based on text prompts.", + widgetModels: ["CompVis/stable-diffusion-v1-4"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/text-to-speech/about.md b/packages/tasks/src/text-to-speech/about.md new file mode 100644 index 000000000..5b2fc6530 --- /dev/null +++ b/packages/tasks/src/text-to-speech/about.md @@ -0,0 +1,62 @@ +## Use Cases + +Text-to-Speech (TTS) models can be used in any speech-enabled application that requires converting text to speech imitating human voice. 
+ +### Voice Assistants + +TTS models are used to create voice assistants on smart devices. These models are a better alternative compared to concatenative methods where the assistant is built by recording sounds and mapping them, since the outputs in TTS models contain elements of natural speech such as emphasis. + +### Announcement Systems + +TTS models are widely used in airport and public transportation announcement systems to convert the announcement of a given text into speech. + +## Inference API + +The Hub contains over [1500 TTS models](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=downloads) that you can use right away by trying out the widgets directly in the browser or calling the models as a service using the Inference API. Here is a simple code snippet to get you started: + +```python +import json +import requests + +headers = {"Authorization": f"Bearer {API_TOKEN}"} +API_URL = "https://api-inference.huggingface.co/models/microsoft/speecht5_tts" + +def query(payload): + response = requests.post(API_URL, headers=headers, json=payload) + return response + +output = query({"text_inputs": "This is a test"}) +``` + +You can also use libraries such as [espnet](https://huggingface.co/models?library=espnet&pipeline_tag=text-to-speech&sort=downloads) or [transformers](https://huggingface.co/models?pipeline_tag=text-to-speech&library=transformers&sort=trending) if you want to handle the Inference directly. + +## Direct Inference + +Now, you can also use the Text-to-Speech pipeline in Transformers to synthesise high quality voice. + +```python +from transformers import pipeline + +synthesizer = pipeline("text-to-speech", "suno/bark") + +synthesizer("Look I am generating speech in three lines of code!") +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer text-to-speech models on Hugging Face Hub.
+ +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.textToSpeech({ + model: "facebook/mms-tts", + inputs: "text to generate speech from", +}); +``` + +## Useful Resources + +- [ML for Audio Study Group - Text to Speech Deep Dive](https://www.youtube.com/watch?v=aLBedWj-5CQ) +- [An introduction to SpeechT5, a multi-purpose speech recognition and synthesis model](https://huggingface.co/blog/speecht5). +- [A guide on Fine-tuning Whisper For Multilingual ASR with 🤗Transformers](https://huggingface.co/blog/fine-tune-whisper) +- [Speech Synthesis, Recognition, and More With SpeechT5](https://huggingface.co/blog/speecht5) diff --git a/packages/tasks/src/text-to-speech/data.ts b/packages/tasks/src/text-to-speech/data.ts new file mode 100644 index 000000000..031aa96d3 --- /dev/null +++ b/packages/tasks/src/text-to-speech/data.ts @@ -0,0 +1,69 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Thousands of short audio clips of a single speaker.", + id: "lj_speech", + }, + { + description: "Multi-speaker English dataset.", + id: "LibriTTS", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "I love audio models on the Hub!", + type: "text", + }, + ], + outputs: [ + { + filename: "audio.wav", + type: "audio", + }, + ], + }, + metrics: [ + { + description: "The Mel Cepstral Distortion (MCD) metric is used to calculate the quality of generated speech.", + id: "mel cepstral distortion", + }, + ], + models: [ + { + description: "A powerful TTS model.", + id: "suno/bark", + }, + { + description: "A massively multi-lingual TTS model.", + id: "facebook/mms-tts", + }, + { + description: "An end-to-end speech synthesis model.", + id: "microsoft/speecht5_tts", + }, + ], + spaces: [ + { + description: "An application for generate highly realistic, multilingual speech.", + id: "suno/bark", + }, + { + description: 
"An application that contains multiple speech synthesis models for various languages and accents.", + id: "coqui/CoquiTTS", + }, + { + description: "An application that synthesizes speech for various speaker types.", + id: "Matthijs/speecht5-tts-demo", + }, + ], + summary: + "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.", + widgetModels: ["microsoft/speecht5_tts"], + youtubeId: "NW62DpzJ274", +}; + +export default taskData; diff --git a/packages/tasks/src/text-to-video/about.md b/packages/tasks/src/text-to-video/about.md new file mode 100644 index 000000000..898d638c2 --- /dev/null +++ b/packages/tasks/src/text-to-video/about.md @@ -0,0 +1,41 @@ +## Use Cases + +### Script-based Video Generation + +Text-to-video models can be used to create short-form video content from a provided text script. These models can be used to create engaging and informative marketing videos. For example, a company could use a text-to-video model to create a video that explains how their product works. + +### Content format conversion + +Text-to-video models can be used to generate videos from long-form text, including blog posts, articles, and text files. Text-to-video models can be used to create educational videos that are more engaging and interactive. An example of this is creating a video that explains a complex concept from an article. + +### Voice-overs and Speech + +Text-to-video models can be used to create an AI newscaster to deliver daily news, or for a film-maker to create a short film or a music video. + +## Task Variants +Text-to-video models have different variants based on inputs and outputs. + +### Text-to-video Editing + +One text-to-video task is generating text-based video style and local attribute editing. 
Text-to-video editing models can make it easier to perform tasks like cropping, stabilization, color correction, resizing and audio editing consistently. + +### Text-to-video Search + +Text-to-video search is the task of retrieving videos that are relevant to a given text query. This can be challenging, as videos are a complex medium that can contain a lot of information. By using semantic analysis to extract the meaning of the text query, visual analysis to extract features from the videos, such as the objects and actions that are present in the video, and temporal analysis to categorize relationships between the objects and actions in the video, we can determine which videos are most likely to be relevant to the text query. + +### Text-driven Video Prediction + +Text-driven video prediction is the task of generating a video sequence from a text description. Text description can be anything from a simple sentence to a detailed story. The goal of this task is to generate a video that is both visually realistic and semantically consistent with the text description. + +### Video Translation + +Text-to-video translation models can translate videos from one language to another or allow to query the multilingual text-video model with non-English sentences. This can be useful for people who want to watch videos in a language that they don't understand, especially when multi-lingual captions are available for training. + +## Inference +Contribute an inference snippet for text-to-video here! + +## Useful Resources + +In this area, you can insert useful resources about how to train or use a model for this task. 
+ +- [Text-to-Video: The Task, Challenges and the Current State](https://huggingface.co/blog/text-to-video) diff --git a/packages/tasks/src/text-to-video/data.ts b/packages/tasks/src/text-to-video/data.ts new file mode 100644 index 000000000..17fba749c --- /dev/null +++ b/packages/tasks/src/text-to-video/data.ts @@ -0,0 +1,102 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Microsoft Research Video to Text is a large-scale dataset for open domain video captioning", + id: "iejMac/CLIP-MSR-VTT", + }, + { + description: "UCF101 Human Actions dataset consists of 13,320 video clips from YouTube, with 101 classes.", + id: "quchenyuan/UCF101-ZIP", + }, + { + description: "A high-quality dataset for human action recognition in YouTube videos.", + id: "nateraw/kinetics", + }, + { + description: "A dataset of video clips of humans performing pre-defined basic actions with everyday objects.", + id: "HuggingFaceM4/something_something_v2", + }, + { + description: + "This dataset consists of text-video pairs and contains noisy samples with irrelevant video descriptions", + id: "HuggingFaceM4/webvid", + }, + { + description: "A dataset of short Flickr videos for the temporal localization of events with descriptions.", + id: "iejMac/CLIP-DiDeMo", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "Darth Vader is surfing on the waves.", + type: "text", + }, + ], + outputs: [ + { + filename: "text-to-video-output.gif", + type: "img", + }, + ], + }, + metrics: [ + { + description: + "Inception Score uses an image classification model that predicts class labels and evaluates how distinct and diverse the images are. A higher score indicates better video generation.", + id: "is", + }, + { + description: + "Frechet Inception Distance uses an image classification model to obtain image embeddings. The metric compares mean and standard deviation of the embeddings of real and generated images. 
A smaller score indicates better video generation.", + id: "fid", + }, + { + description: + "Frechet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.", + id: "fvd", + }, + { + description: + "CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.", + id: "clipsim", + }, + ], + models: [ + { + description: "A strong model for video generation.", + id: "PAIR/text2video-zero-controlnet-canny-arcane", + }, + { + description: "A robust model for text-to-video generation.", + id: "damo-vilab/text-to-video-ms-1.7b", + }, + { + description: "A text-to-video generation model with high quality and smooth outputs.", + id: "cerspense/zeroscope_v2_576w", + }, + ], + spaces: [ + { + description: "An application that generates video from text.", + id: "fffiloni/zeroscope", + }, + { + description: "An application that generates video from image and text.", + id: "TempoFunk/makeavid-sd-jax", + }, + { + description: "An application that generates videos from text and provides multi-model support.", + id: "ArtGAN/Video-Diffusion-WebUI", + }, + ], + summary: + "Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ", + widgetModels: [], + youtubeId: undefined, +}; + +export default taskData; diff --git a/packages/tasks/src/token-classification/about.md b/packages/tasks/src/token-classification/about.md new file mode 100644 index 000000000..9b0701385 --- /dev/null +++ b/packages/tasks/src/token-classification/about.md @@ -0,0 +1,76 @@ +## Use Cases + +### Information Extraction from Invoices + +You can extract entities of interest from invoices automatically using Named Entity Recognition (NER) models. Invoices can be read with Optical Character Recognition models and the output can be used to do inference with NER models. 
In this way, important information such as date, company name, and other named entities can be extracted. + +## Task Variants + +### Named Entity Recognition (NER) + +NER is the task of recognizing named entities in a text. These entities can be the names of people, locations, or organizations. The task is formulated as labeling each token with a class for each named entity and a class named "O" (for "outside") for tokens that do not contain any entities. The input for this task is text and the output is the annotated text with named entities. + +#### Inference + +You can use the 🤗 Transformers library `ner` pipeline to infer with NER models. + +```python +from transformers import pipeline + +classifier = pipeline("ner") +classifier("Hello I'm Omar and I live in Zürich.") +``` + +### Part-of-Speech (PoS) Tagging +In PoS tagging, the model recognizes parts of speech, such as nouns, pronouns, adjectives, or verbs, in a given text. The task is formulated as labeling each word with a part of speech. + +#### Inference + +You can use the 🤗 Transformers library `token-classification` pipeline with a PoS tagging model of your choice. The model will return a JSON with PoS tags for each token. + +```python +from transformers import pipeline + +classifier = pipeline("token-classification", model = "vblagoje/bert-english-uncased-finetuned-pos") +classifier("Hello I'm Omar and I live in Zürich.") +``` + +This is not limited to transformers! You can also use other libraries such as Stanza, spaCy, and Flair to do inference! Here is an example using a canonical [spaCy](https://hf.co/blog/spacy) model. + +```python +!pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl + +import en_core_web_sm + +nlp = en_core_web_sm.load() +doc = nlp("I'm Omar and I live in Zürich.") +for token in doc: + print(token.text, token.pos_, token.dep_, token.ent_type_) + +## I PRON nsubj +## 'm AUX ROOT +## Omar PROPN attr PERSON +### ...
+``` + +## Useful Resources + +Would you like to learn more about token classification? Great! Here you can find some curated resources that you may find helpful! + +- [Course Chapter on Token Classification](https://huggingface.co/course/chapter7/2?fw=pt) +- [Blog post: Welcome spaCy to the Hugging Face Hub](https://huggingface.co/blog/spacy) + +### Notebooks + +- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb) +- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) +- [Flax](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification) + +### Documentation + +- [Token classification task guide](https://huggingface.co/docs/transformers/tasks/token_classification) diff --git a/packages/tasks/src/token-classification/data.ts b/packages/tasks/src/token-classification/data.ts new file mode 100644 index 000000000..cb3e211df --- /dev/null +++ b/packages/tasks/src/token-classification/data.ts @@ -0,0 +1,84 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A widely used dataset useful to benchmark named entity recognition models.", + id: "conll2003", + }, + { + description: + "A multilingual dataset of Wikipedia articles annotated for named entity recognition in over 150 different languages.", + id: "wikiann", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "My name is Omar and I live in Zürich.", + type: "text", + }, + ], + outputs: [ + { + text: "My name is Omar and I live in Zürich.", + tokens: [ + { + type: "PERSON", + start: 11, + end: 15, + }, + { + type: "GPE", + start: 30, + end: 36, + }, + ], + type: "text-with-tokens", 
+ }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: "", + id: "f1", + }, + ], + models: [ + { + description: + "A robust performance model to identify people, locations, organizations and names of miscellaneous entities.", + id: "dslim/bert-base-NER", + }, + { + description: "Flair models are typically the state of the art in named entity recognition tasks.", + id: "flair/ner-english", + }, + ], + spaces: [ + { + description: + "An application that can recognizes entities, extracts noun chunks and recognizes various linguistic features of each token.", + id: "spacy/gradio_pipeline_visualizer", + }, + ], + summary: + "Token classification is a natural language understanding task in which a label is assigned to some tokens in a text. Some popular token classification subtasks are Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. NER models could be trained to identify specific entities in a text, such as dates, individuals and places; and PoS tagging would identify, for example, which words in a text are verbs, nouns, and punctuation marks.", + widgetModels: ["dslim/bert-base-NER"], + youtubeId: "wVHdVlPScxA", +}; + +export default taskData; diff --git a/packages/tasks/src/translation/about.md b/packages/tasks/src/translation/about.md new file mode 100644 index 000000000..39755a8db --- /dev/null +++ b/packages/tasks/src/translation/about.md @@ -0,0 +1,65 @@ +## Use Cases + +You can find over a thousand Translation models on the Hub, but sometimes you might not find a model for the language pair you are interested in. When this happens, you can use a pretrained multilingual Translation model like [mBART](https://huggingface.co/facebook/mbart-large-cc25) and further train it on your own data in a process called fine-tuning. 
+ +### Multilingual conversational agents + +Translation models can be used to build conversational agents across different languages. This can be done in two ways. + +- **Translate the dataset to a new language.** You can translate a dataset of intents (inputs) and responses to the target language. You can then train a new intent classification model with this new dataset. This allows you to proofread responses in the target language and have better control of the chatbot's outputs. + +- **Translate the input and output of the agent.** You can use a Translation model on user inputs so that the chatbot can process them. You can then translate the output of the chatbot into the language of the user. This approach might be less reliable as the chatbot will generate responses that were not defined before. + +## Inference + +You can use the 🤗 Transformers library with the `translation_xx_to_yy` pattern where xx is the source language code and yy is the target language code. The default model for the pipeline is [t5-base](https://huggingface.co/t5-base) which under the hood adds a task prefix indicating the task itself, e.g. “translate: English to French”. + +```python +from transformers import pipeline +en_fr_translator = pipeline("translation_en_to_fr") +en_fr_translator("How old are you?") +# [{'translation_text': ' quel âge êtes-vous?'}] +``` + +If you’d like to use a specific model checkpoint that is from one specific language to another, you can also directly use the `translation` pipeline. + +```python +from transformers import pipeline + +model_checkpoint = "Helsinki-NLP/opus-mt-en-fr" +translator = pipeline("translation", model=model_checkpoint) +translator("How are you?") +# [{'translation_text': 'Comment allez-vous ?'}] +``` + +You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer translation models on Hugging Face Hub. 
+ +```javascript +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference(HF_ACCESS_TOKEN); +await inference.translation({ + model: "t5-base", + inputs: "My name is Wolfgang and I live in Berlin", +}); +``` + +## Useful Resources + +Would you like to learn more about Translation? Great! Here you can find some curated resources that you may find helpful! + +- [Course Chapter on Translation](https://huggingface.co/course/chapter7/4?fw=pt) + +### Notebooks + +- [PyTorch](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb) +- [TensorFlow](https://github.com/huggingface/notebooks/blob/master/examples/translation-tf.ipynb) + +### Scripts for training + +- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) +- [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) + +### Documentation + +- [Translation task guide](https://huggingface.co/docs/transformers/tasks/translation) diff --git a/packages/tasks/src/translation/data.ts b/packages/tasks/src/translation/data.ts new file mode 100644 index 000000000..7973a77bd --- /dev/null +++ b/packages/tasks/src/translation/data.ts @@ -0,0 +1,68 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A dataset of copyright-free books translated into 16 different languages.", + id: "opus_books", + }, + { + description: + "An example of translation between programming languages. 
This dataset consists of functions in Java and C#.", + id: "code_x_glue_cc_code_to_code_trans", + }, + ], + demo: { + inputs: [ + { + label: "Input", + content: "My name is Omar and I live in Zürich.", + type: "text", + }, + ], + outputs: [ + { + label: "Output", + content: "Mein Name ist Omar und ich wohne in Zürich.", + type: "text", + }, + ], + }, + metrics: [ + { + description: + "BLEU score is calculated by counting the number of shared single or subsequent tokens between the generated sequence and the reference. Subsequent n tokens are called “n-grams”. Unigram refers to a single token while bi-gram refers to token pairs and n-grams refer to n subsequent tokens. The score ranges from 0 to 1, where 1 means the translation perfectly matched and 0 did not match at all", + id: "bleu", + }, + { + description: "", + id: "sacrebleu", + }, + ], + models: [ + { + description: "A model that translates from English to French.", + id: "Helsinki-NLP/opus-mt-en-fr", + }, + { + description: + "A general-purpose Transformer that can be used to translate from English to German, French, or Romanian.", + id: "t5-base", + }, + ], + spaces: [ + { + description: "An application that can translate between 100 languages.", + id: "Iker/Translate-100-languages", + }, + { + description: "An application that can translate between English, Spanish and Hindi.", + id: "EuroPython2022/Translate-with-Bloom", + }, + ], + summary: "Translation is the task of converting text from one language to another.", + widgetModels: ["t5-small"], + youtubeId: "1JvfrvZgi6c", +}; + +export default taskData; diff --git a/packages/tasks/src/unconditional-image-generation/about.md b/packages/tasks/src/unconditional-image-generation/about.md new file mode 100644 index 000000000..e5a958552 --- /dev/null +++ b/packages/tasks/src/unconditional-image-generation/about.md @@ -0,0 +1,50 @@ +## About the Task + +Unconditional image generation is the task of generating new images without any specific input. 
The main goal of this is to create novel, original images that are not based on existing images. +This can be used for a variety of applications, such as creating new artistic images, improving image recognition algorithms, or generating photorealistic images for virtual reality environments. + +Unconditional image generation models usually start with a _seed_ that generates a _random noise vector_. The model will then use this vector to create an output image similar to the images used for training the model. + +An example of unconditional image generation would be generating the image of a face on a model trained with the [CelebA dataset](https://huggingface.co/datasets/huggan/CelebA-HQ) or [generating a butterfly](https://huggingface.co/spaces/huggan/butterfly-gan) on a model trained with the [Smithsonian Butterflies dataset](https://huggingface.co/datasets/ceyda/smithsonian_butterflies). + +[Generative adversarial networks](https://en.wikipedia.org/wiki/Generative_adversarial_network) and [Diffusion](https://huggingface.co/docs/diffusers/index) are common architectures for this task. + +## Use Cases + +Unconditional image generation can be used for a variety of applications. + +### Artistic Expression + +Unconditional image generation can be used to create novel, original artwork that is not based on any existing images. This can be used to explore new creative possibilities and produce unique, imaginative images. + +### Data Augmentation + +Unconditional image generation models can be used to generate new images to improve the performance of image recognition algorithms. This makes algorithms more robust and able to handle a broader range of images. + +### Virtual Reality + +Unconditional image generation models can be used to create photorealistic images that can be used in virtual reality environments. This makes the VR experience more immersive and realistic. 
+ +### Medical Imaging + +Unconditional image generation models can generate new medical images, such as CT or MRI scans, that can be used to train and evaluate medical imaging algorithms. This can improve the accuracy and reliability of these algorithms. + +### Industrial Design + +Unconditional image generation models can generate new designs for products, such as clothing or furniture, that are not based on any existing designs. This way, designers can explore new creative possibilities and produce unique, innovative designs. + +## Model Hosting and Inference + +This section should have useful information about Model Hosting and Inference + +## Useful Resources + +- [Hugging Face Diffusion Models Course](https://github.com/huggingface/diffusion-models-class) +- [Getting Started with Diffusers](https://huggingface.co/docs/diffusers/index) +- [Unconditional Image Generation Training](https://huggingface.co/docs/diffusers/training/unconditional_training) + +### Training your own model in just a few seconds + +In this area, you can insert useful information about training the model + +This page was made possible thanks to the efforts of [Someet Sahoo](https://huggingface.co/Someet24) and [Juan Carlos Piñeros](https://huggingface.co/juancopi81). 
diff --git a/packages/tasks/src/unconditional-image-generation/data.ts b/packages/tasks/src/unconditional-image-generation/data.ts new file mode 100644 index 000000000..f9eeac7e4 --- /dev/null +++ b/packages/tasks/src/unconditional-image-generation/data.ts @@ -0,0 +1,72 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: + "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images per class.", + id: "cifar100", + }, + { + description: "Multiple images of celebrities, used for facial expression translation.", + id: "CelebA", + }, + ], + demo: { + inputs: [ + { + label: "Seed", + content: "42", + type: "text", + }, + { + label: "Number of images to generate:", + content: "4", + type: "text", + }, + ], + outputs: [ + { + filename: "unconditional-image-generation-output.jpeg", + type: "img", + }, + ], + }, + metrics: [ + { + description: + "The inception score (IS) evaluates the quality of generated images. It measures the diversity of the generated images (the model predictions are evenly distributed across all possible labels) and their 'distinction' or 'sharpness' (the model confidently predicts a single label for each image).", + id: "Inception score (IS)", + }, + { + description: + "The Fréchet Inception Distance (FID) evaluates the quality of images created by a generative model by calculating the distance between feature vectors for real and generated images.", + id: "Frećhet Inception Distance (FID)", + }, + ], + models: [ + { + description: + "High-quality image generation model trained on the CIFAR-10 dataset. It synthesizes images of the ten classes presented in the dataset using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.", + id: "google/ddpm-cifar10-32", + }, + { + description: + "High-quality image generation model trained on the 256x256 CelebA-HQ dataset. 
It synthesizes images of faces using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.", + id: "google/ddpm-celebahq-256", + }, + ], + spaces: [ + { + description: "An application that can generate realistic faces.", + id: "CompVis/celeba-latent-diffusion", + }, + ], + summary: + "Unconditional image generation is the task of generating images with no condition in any context (like a prompt text or another image). Once trained, the model will create images that resemble its training data distribution.", + widgetModels: [""], + // TODO: Add related video + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/video-classification/about.md b/packages/tasks/src/video-classification/about.md new file mode 100644 index 000000000..fc87585af --- /dev/null +++ b/packages/tasks/src/video-classification/about.md @@ -0,0 +1,57 @@ +## Use Cases + +Video classification models can be used to categorize what a video is all about. + +### Activity Recognition + +Video classification models are used to perform activity recognition which is useful for fitness applications. Activity recognition is also helpful for vision-impaired individuals especially when they're commuting. + +### Video Search + +Models trained in video classification can improve user experience by organizing and categorizing video galleries on the phone or in the cloud, on multiple keywords or tags. + +## Inference + +Below you can find code for inferring with a pre-trained video classification model. + +```python +from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification +from pytorchvideo.transforms import UniformTemporalSubsample +from pytorchvideo.data.encoded_video import EncodedVideo + + +# Load the video. 
+video = EncodedVideo.from_path("path_to_video.mp4") +video_data = video.get_clip(start_sec=0, end_sec=4.0)["video"] + +# Sub-sample a fixed set of frames and convert them to a NumPy array. +num_frames = 16 +subsampler = UniformTemporalSubsample(num_frames) +subsampled_frames = subsampler(video_data) +video_data_np = subsampled_frames.numpy().transpose(1, 2, 3, 0) + +# Preprocess the video frames. +inputs = feature_extractor(list(video_data_np), return_tensors="pt") + +# Run inference +with torch.no_grad(): + outputs = model(**inputs) + logits = outputs.logits + +# Model predicts one of the 400 Kinetics 400 classes +predicted_label = logits.argmax(-1).item() +print(model.config.id2label[predicted_label]) +# `eating spaghetti` (if you chose this video: +# https://hf.co/datasets/nielsr/video-demo/resolve/main/eating_spaghetti.mp4) +``` + +## Useful Resources + +- [Developing a simple video classification model](https://keras.io/examples/vision/video_classification) +- [Video classification with Transformers](https://keras.io/examples/vision/video_transformers) +- [Building a video archive](https://www.youtube.com/watch?v=_IeS1m8r6SY) +- [Video classification task guide](https://huggingface.co/docs/transformers/tasks/video_classification) + +### Creating your own video classifier in minutes + +- [Fine-tuning tutorial notebook (PyTorch)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb) diff --git a/packages/tasks/src/video-classification/data.ts b/packages/tasks/src/video-classification/data.ts new file mode 100644 index 000000000..2f6e4d935 --- /dev/null +++ b/packages/tasks/src/video-classification/data.ts @@ -0,0 +1,84 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "Benchmark dataset used for video classification with videos that belong to 400 classes.", + id: "kinetics400", + }, + ], + demo: { + 
inputs: [ + { + filename: "video-classification-input.gif", + type: "img", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "Playing Guitar", + score: 0.514, + }, + { + label: "Playing Tennis", + score: 0.193, + }, + { + label: "Cooking", + score: 0.068, + }, + ], + }, + ], + }, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: "", + id: "recall", + }, + { + description: "", + id: "precision", + }, + { + description: "", + id: "f1", + }, + ], + models: [ + { + // TO DO: write description + description: "Strong Video Classification model trained on the Kinects 400 dataset.", + id: "MCG-NJU/videomae-base-finetuned-kinetics", + }, + { + // TO DO: write description + description: "Strong Video Classification model trained on the Kinects 400 dataset.", + id: "microsoft/xclip-base-patch32", + }, + ], + spaces: [ + { + description: "An application that classifies video at different timestamps.", + id: "nateraw/lavila", + }, + { + description: "An application that classifies video.", + id: "fcakyon/video-classification", + }, + ], + summary: + "Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to.", + widgetModels: [], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/visual-question-answering/about.md b/packages/tasks/src/visual-question-answering/about.md new file mode 100644 index 000000000..7f96e1679 --- /dev/null +++ b/packages/tasks/src/visual-question-answering/about.md @@ -0,0 +1,48 @@ +## Use Cases + +### Aid the Visually Impaired Persons + +VQA models can be used to reduce visual barriers for visually impaired individuals by allowing them to get information about images from the web and the real world. 
+ +### Education + +VQA models can be used to improve experiences at museums by allowing observers to directly ask questions they are interested in. + +### Improved Image Retrieval + +Visual question answering models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images. + +### Video Search + +Specific snippets/timestamps of a video can be retrieved based on search queries. For example, the user can ask "At which part of the video does the guitar appear?" and get a specific timestamp range from the whole video. + +## Task Variants + +### Video Question Answering + +Video Question Answering aims to answer questions asked about the content of a video. + +## Inference + +You can infer with Visual Question Answering models using the `vqa` (or `visual-question-answering`) pipeline. This pipeline requires [the Python Imaging Library (PIL)](https://pillow.readthedocs.io/en/stable/) to process images. You can install it with (`pip install pillow`). + +```python +from PIL import Image +from transformers import pipeline + +vqa_pipeline = pipeline("visual-question-answering") + +image = Image.open("elephant.jpeg") +question = "Is there an elephant?" + +vqa_pipeline(image, question, top_k=1) +#[{'score': 0.9998154044151306, 'answer': 'yes'}] +``` + +## Useful Resources + +- [An introduction to Visual Question Answering - AllenAI](https://blog.allenai.org/vanilla-vqa-adcaaaa94336) +- [Multi Modal Framework (MMF) - Meta Research](https://mmf.sh/docs/getting_started/video_overview/) + +The contents of this page are contributed by +[Bharat Raghunathan](https://huggingface.co/bharat-raghunathan) and [Jose Londono Botero](https://huggingface.co/jlondonobo). 
diff --git a/packages/tasks/src/visual-question-answering/data.ts b/packages/tasks/src/visual-question-answering/data.ts new file mode 100644 index 000000000..83a7e6949 --- /dev/null +++ b/packages/tasks/src/visual-question-answering/data.ts @@ -0,0 +1,93 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A widely used dataset containing questions (with answers) about images.", + id: "Graphcore/vqa", + }, + { + description: "A dataset to benchmark visual reasoning based on text in images.", + id: "textvqa", + }, + ], + demo: { + inputs: [ + { + filename: "elephant.jpeg", + type: "img", + }, + { + label: "Question", + content: "What is in this image?", + type: "text", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "elephant", + score: 0.97, + }, + { + label: "elephants", + score: 0.06, + }, + { + label: "animal", + score: 0.003, + }, + ], + }, + ], + }, + isPlaceholder: false, + metrics: [ + { + description: "", + id: "accuracy", + }, + { + description: + "Measures how much a predicted answer differs from the ground truth based on the difference in their semantic meaning.", + id: "wu-palmer similarity", + }, + ], + models: [ + { + description: "A visual question answering model trained to convert charts and plots to text.", + id: "google/deplot", + }, + { + description: + "A visual question answering model trained for mathematical reasoning and chart derendering from images.", + id: "google/matcha-base ", + }, + { + description: "A strong visual question answering that answers questions from book covers.", + id: "google/pix2struct-ocrvqa-large", + }, + ], + spaces: [ + { + description: "An application that can answer questions based on images.", + id: "nielsr/vilt-vqa", + }, + { + description: "An application that can caption images and answer questions about a given image. 
", + id: "Salesforce/BLIP", + }, + { + description: "An application that can caption images and answer questions about a given image. ", + id: "vumichien/Img2Prompt", + }, + ], + summary: + "Visual Question Answering is the task of answering open-ended questions based on an image. They output natural language responses to natural language questions.", + widgetModels: ["dandelin/vilt-b32-finetuned-vqa"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/src/zero-shot-classification/about.md b/packages/tasks/src/zero-shot-classification/about.md new file mode 100644 index 000000000..9b7ff3c48 --- /dev/null +++ b/packages/tasks/src/zero-shot-classification/about.md @@ -0,0 +1,40 @@ +## About the Task + +Zero Shot Classification is the task of predicting a class that wasn't seen by the model during training. This method, which leverages a pre-trained language model, can be thought of as an instance of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg) which generally refers to using a model trained for one task in a different application than what it was originally trained for. This is particularly useful for situations where the amount of labeled data is small. + +In zero shot classification, we provide the model with a prompt and a sequence of text that describes what we want our model to do, in natural language. Zero-shot classification excludes any examples of the desired task being completed. This differs from single or few-shot classification, as these tasks include a single or a few examples of the selected task. + +Zero, single and few-shot classification seem to be an emergent feature of large language models. This feature seems to come about around model sizes of +100M parameters. The effectiveness of a model at a zero, single or few-shot task seems to scale with model size, meaning that larger models (models with more trainable parameters or layers) generally do better at this task. 
+ +Here is an example of a zero-shot prompt for classifying the sentiment of a sequence of text: + +``` +Classify the following input text into one of the following three categories: [positive, negative, neutral] + +Input Text: Hugging Face is awesome for making all of these +state of the art models available! +Sentiment: positive + +``` + +One great example of this task with a nice off-the-shelf model is available at the widget of this page, where the user can input a sequence of text and candidate labels to the model. This is a _word level_ example of zero shot classification; more elaborate and lengthy generations are available with larger models. Testing these models out and getting a feel for prompt engineering is the best way to learn how to use them. + +## Inference + +You can use the 🤗 Transformers library zero-shot-classification pipeline to infer with zero shot text classification models. + +```python +from transformers import pipeline + +pipe = pipeline(model="facebook/bart-large-mnli") +pipe("I have a problem with my iphone that needs to be resolved asap!", + candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], +) +# output +# {'sequence': 'I have a problem with my iphone that needs to be resolved asap!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} +``` + +## Useful Resources + +- [Zero Shot Learning](https://joeddav.github.io/blog/2020/05/29/ZSL.html) +- [Hugging Face on Transfer Learning](https://huggingface.co/course/en/chapter1/4?fw=pt#transfer-learning) diff --git a/packages/tasks/src/zero-shot-classification/data.ts b/packages/tasks/src/zero-shot-classification/data.ts new file mode 100644 index 000000000..c949fd0a1 --- /dev/null +++ b/packages/tasks/src/zero-shot-classification/data.ts @@ -0,0 +1,66 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "A widely used dataset used to 
benchmark multiple variants of text classification.", + id: "glue", + }, + { + description: + "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information.", + id: "MultiNLI", + }, + { + description: + "FEVER is a publicly available dataset for fact extraction and verification against textual sources.", + id: "FEVER", + }, + ], + demo: { + inputs: [ + { + label: "Text Input", + content: "Dune is the best movie ever.", + type: "text", + }, + { + label: "Candidate Labels", + content: "CINEMA, ART, MUSIC", + type: "text", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "CINEMA", + score: 0.9, + }, + { + label: "ART", + score: 0.1, + }, + { + label: "MUSIC", + score: 0.0, + }, + ], + }, + ], + }, + metrics: [], + models: [ + { + description: "Powerful zero-shot text classification model", + id: "facebook/bart-large-mnli", + }, + ], + spaces: [], + summary: + "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.", + widgetModels: ["facebook/bart-large-mnli"], +}; + +export default taskData; diff --git a/packages/tasks/src/zero-shot-image-classification/about.md b/packages/tasks/src/zero-shot-image-classification/about.md new file mode 100644 index 000000000..0c4b28328 --- /dev/null +++ b/packages/tasks/src/zero-shot-image-classification/about.md @@ -0,0 +1,76 @@ +## About the Task + +Zero-shot image classification is a computer vision task to classify images into one of several classes, without any prior training or knowledge of the classes. + +Zero shot image classification works by transferring knowledge learnt during training of one model, to classify novel classes that were not present in the training data. So this is a variation of [transfer learning](https://www.youtube.com/watch?v=BqqfQnyjmgg). 
For instance, a model trained to differentiate cars from airplanes can be used to classify images of ships. + +The data in this learning paradigm consists of + +- Seen data - images and their corresponding labels +- Unseen data - only labels and no images +- Auxiliary information - additional information given to the model during training connecting the unseen and seen data. This can be in the form of textual description or word embeddings. + +## Use Cases + +### Image Retrieval + +Zero-shot learning resolves several challenges in image retrieval systems. For example, with the rapid growth of categories on the web, it is challenging to index images based on unseen categories. With zero-shot learning we can associate unseen categories to images by exploiting attributes to model the relationships among visual features and labels. + +### Action Recognition + +Action recognition is the task of identifying when a person in an image/video is performing a given action from a set of actions. If all the possible actions are not known beforehand, conventional deep learning models fail. With zero-shot learning, for a given domain of a set of actions, we can create a mapping connecting low-level features and a semantic description of auxiliary data to classify unknown classes of actions. + +## Task Variants + +You can contribute variants of this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md). + +## Inference + +The model can be loaded with the zero-shot-image-classification pipeline like so: + +```python +from transformers import pipeline +# More models in the model hub. +model_name = "openai/clip-vit-large-patch14-336" +classifier = pipeline("zero-shot-image-classification", model = model_name) +``` + +You can then use this pipeline to classify images into any of the class names you specify. You can specify more than two class labels too. 
+ +```python +image_to_classify = "path_to_cat_and_dog_image.jpeg" +labels_for_classification = ["cat and dog", + "lion and cheetah", + "rabbit and lion"] +scores = classifier(image_to_classify, + candidate_labels = labels_for_classification) +``` + +The classifier would return a list of dictionaries after the inference which is stored in the variable `scores` in the code snippet above. Variable `scores` would look as follows: + +```python +[{'score': 0.9950482249259949, 'label': 'cat and dog'}, +{'score': 0.004863627254962921, 'label': 'rabbit and lion'}, +{'score': 8.816882473183796e-05, 'label': 'lion and cheetah'}] +``` + +The dictionary at the zeroth index of the list will contain the label with the highest score. + +```python +print(f"The highest score is {scores[0]['score']:.3f} for the label {scores[0]['label']}") +``` + +The output from the print statement above would look as follows: + +``` +The highest score is 0.995 for the label cat and dog +``` + +## Useful Resources + +You can contribute useful resources about this task [here](https://github.com/huggingface/hub-docs/blob/main/tasks/src/zero-shot-image-classification/about.md). + +Check out [Zero-shot image classification task guide](https://huggingface.co/docs/transformers/tasks/zero_shot_image_classification). + +This page was made possible thanks to the efforts of [Shamima Hossain](https://huggingface.co/Shamima), +[Haider Zaidi](https://huggingface.co/chefhaider) and [Paarth Bhatnagar](https://huggingface.co/Paarth). 
diff --git a/packages/tasks/src/zero-shot-image-classification/data.ts b/packages/tasks/src/zero-shot-image-classification/data.ts new file mode 100644 index 000000000..be8da73d4 --- /dev/null +++ b/packages/tasks/src/zero-shot-image-classification/data.ts @@ -0,0 +1,77 @@ +import type { TaskDataCustom } from "../Types"; + +const taskData: TaskDataCustom = { + datasets: [ + { + // TODO write proper description + description: "", + id: "", + }, + ], + demo: { + inputs: [ + { + filename: "image-classification-input.jpeg", + type: "img", + }, + { + label: "Classes", + content: "cat, dog, bird", + type: "text", + }, + ], + outputs: [ + { + type: "chart", + data: [ + { + label: "Cat", + score: 0.664, + }, + { + label: "Dog", + score: 0.329, + }, + { + label: "Bird", + score: 0.008, + }, + ], + }, + ], + }, + metrics: [ + { + description: "Computes the number of times the correct label appears in top K labels predicted", + id: "top-K accuracy", + }, + ], + models: [ + { + description: "Robust image classification model trained on publicly available image-caption data.", + id: "openai/clip-vit-base-patch16", + }, + { + description: + "Robust image classification model trained on publicly available image-caption data trained on additional high pixel data for better performance.", + id: "openai/clip-vit-large-patch14-336", + }, + { + description: "Strong image classification model for biomedical domain.", + id: "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", + }, + ], + spaces: [ + { + description: + "An application that leverages zero shot image classification to find best captions to generate an image. 
", + id: "pharma/CLIP-Interrogator", + }, + ], + summary: + "Zero shot image classification is the task of classifying previously unseen classes during training of a model.", + widgetModels: ["openai/clip-vit-large-patch14-336"], + youtubeId: "", +}; + +export default taskData; diff --git a/packages/tasks/tsconfig.json b/packages/tasks/tsconfig.json new file mode 100644 index 000000000..37823efde --- /dev/null +++ b/packages/tasks/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "allowSyntheticDefaultImports": true, + "lib": ["ES2022", "DOM"], + "module": "CommonJS", + "moduleResolution": "node", + "target": "ES2022", + "forceConsistentCasingInFileNames": true, + "strict": true, + "noImplicitAny": true, + "strictNullChecks": true, + "skipLibCheck": true, + "noImplicitOverride": true, + "outDir": "./dist" + }, + "include": ["src"], + "exclude": ["dist"] +} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index afe3816ef..8991ef9f1 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -5,3 +5,4 @@ packages: - "packages/shared" - "packages/agents" - "packages/languages" + - "packages/tasks"