Skip to content

Commit

Permalink
Merge pull request #18 from hangyav/feat/new_models
Browse files Browse the repository at this point in the history
New instruction tuned models
  • Loading branch information
hangyav authored Jan 18, 2024
2 parents 753edca + 1e340eb commit d89dfce
Show file tree
Hide file tree
Showing 9 changed files with 330 additions and 14 deletions.
46 changes: 42 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,31 @@ The following tools run on the local system:

```pip install git+https://github.com/PrithivirajDamodaran/Gramformer.git```
* hf_checker: Huggingface `text2text-generation` pipeline based analyser. See the [flan-t5-large-grammar-synthesis](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis) model for an example.
<details><summary>Models</summary>
<ul>
<li>pszemraj/grammar-synthesis-small</li>
<li>pszemraj/grammar-synthesis-large</li>
<li>pszemraj/flan-t5-large-grammar-synthesis</li>
<li>pszemraj/flan-t5-xl-grammar-synthesis</li>
<li>pszemraj/bart-base-grammar-synthesis</li>
</ul>
</details>
* hf_instruction_checker: Huggingface `text2text-generation` pipeline based
analyser using instruction-tuned models. See Grammarly's
[CoEdIT](https://github.com/vipulraheja/coedit) model for an example. Supports
error checking and text generation, such as paraphrasing, through the `%HF%`
magic command (see the OpenAI analyser below).
<details><summary>Models</summary>
<ul>
<li>grammarly/coedit-large</li>
<li>grammarly/coedit-xl</li>
<li>grammarly/coedit-xl-composite</li>
<li>grammarly/coedit-xxl</li>
<li>jbochi/coedit-base</li>
<li>jbochi/coedit-small</li>
<li>jbochi/candle-coedit-quantized</li>
</ul>
</details>
* [hf_completion](https://huggingface.co/docs/transformers/task_summary#language-modeling): Huggingface `fill-mask` pipeline based text completion.

### Tools using remote services
Expand Down Expand Up @@ -94,9 +119,9 @@ ssh <server> textlsp
## Configuration

Using textLSP within an editor depends on the editor of choice.
For a few examples how to setup language servers in general in some of the popular editors see [here](https://github.com/openlawlibrary/pygls/tree/master/examples/hello-world#editor-configurations) or take a look at the related documentation of your editor.
For a few examples how to set up language servers in general in some of the popular editors see [here](https://github.com/openlawlibrary/pygls/tree/master/examples/hello-world#editor-configurations) or take a look at the related documentation of your editor.

By default all analyzers are disabled in textLSP, they have to be turned on in the settings.
By default, all analyzers are disabled in textLSP; they have to be turned on in the settings.
Example configuration in lua for nvim (other editors should be set up accordingly):

```lua
Expand All @@ -121,10 +146,22 @@ textLSP = {
}
},
hf_checker = {
enabled = true,
enabled = false,
gpu = false,
quantize=32,
model='pszemraj/flan-t5-large-grammar-synthesis',
-- model='pszemraj/grammar-synthesis-large',
min_length=40,
check_text = {
on_open = false,
on_save = true,
on_change = false,
}
},
hf_instruction_checker = {
enabled = true,
gpu = false,
quantize=32,
model='grammarly/coedit-large',
min_length=40,
check_text = {
on_open = false,
Expand All @@ -135,6 +172,7 @@ textLSP = {
hf_completion = {
enabled = true,
gpu = false,
quantize=32,
model='bert-base-multilingual-cased',
topk=5,
},
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def read(fname):
'openai==1.6.1',
'transformers==4.36.2',
'sortedcontainers==2.4.0',
'bitsandbytes==0.42.0',
],
extras_require={
'dev': [
Expand Down
44 changes: 44 additions & 0 deletions tests/analysers/hf_instruction_checker_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest

from textLSP.analysers.hf_instruction_checker import HFInstructionCheckerAnalyser
from textLSP.documents.document import BaseDocument


@pytest.fixture
def analyser():
    """Build an HFInstructionCheckerAnalyser wired to the CoEdIT model.

    No language server is needed for these unit tests, so ``None`` is
    passed in its place.
    """
    return HFInstructionCheckerAnalyser(
        None,
        {
            HFInstructionCheckerAnalyser.CONFIGURATION_MODEL: 'grammarly/coedit-large',
        },
        # Use the analyser's own name; 'hf_checker' was a copy-paste
        # leftover from the hf_checker test module.
        'hf_instruction_checker',
    )


@pytest.mark.parametrize('doc,exp', [
    (
        BaseDocument(
            'DUMMY_URL',
            'This is a short sentence.',
            version=1,
        ),
        False,
    ),
    (
        BaseDocument(
            'DUMMY_URL',
            'This is a long enough sentence with an eror or tvo.',
            version=1,
        ),
        True,
    ),
])
def test_simple(doc, exp, analyser):
    """Diagnostics and code actions are produced iff the text has errors."""
    diagnostics, actions = analyser._analyse_lines(doc.cleaned_source, doc)

    # Non-empty results are expected exactly when errors are present.
    assert (len(diagnostics) > 0) == exp
    assert (len(actions) > 0) == exp
34 changes: 29 additions & 5 deletions textLSP/analysers/hf_checker/hf_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Position,
TextEdit,
CodeAction,
MessageType,
)
from pygls.server import LanguageServer
from transformers import pipeline
Expand All @@ -18,8 +19,10 @@
Interval,
LINE_PATTERN,
TokenDiff,
ConfigurationError,
)
from ...documents.document import BaseDocument
from ... import nn_utils


logger = logging.getLogger(__name__)
Expand All @@ -29,19 +32,40 @@ class HFCheckerAnalyser(Analyser):
CONFIGURATION_GPU = 'gpu'
CONFIGURATION_MODEL = 'model'
CONFIGURATION_MIN_LENGTH = 'min_length'
CONFIGURATION_QUANTIZE = 'quantize'

SETTINGS_DEFAULT_GPU = False
SETTINGS_DEFAULT_MODEL = 'pszemraj/flan-t5-large-grammar-synthesis'
SETTINGS_DEFAULT_MIN_LENGTH = 40
SETTINGS_DEFAULT_MODEL = 'grammarly/coedit-large'
SETTINGS_DEFAULT_MIN_LENGTH = 0
SETTINGS_DEFAULT_QUANTIZE = 32

def __init__(self, language_server: LanguageServer, config: dict, name: str):
    """Set up the ``text2text-generation`` pipeline used for checking.

    Device (CPU/GPU), quantization and model are read from the analyser
    configuration. If the quantization setting is invalid for the chosen
    device, the user is notified via the language server and the analyser
    falls back to full precision (32 bit).
    """
    super().__init__(language_server, config, name)
    use_gpu = self.config.get(self.CONFIGURATION_GPU, self.SETTINGS_DEFAULT_GPU)
    device = nn_utils.get_device(use_gpu)

    # setdefault() keeps the effective value visible in the config.
    # NOTE: fixed misspelled local 'quanitze' -> 'quantize'.
    quantize = self.config.setdefault(self.CONFIGURATION_QUANTIZE, self.SETTINGS_DEFAULT_QUANTIZE)
    model_kwargs = dict()
    try:
        nn_utils.set_quantization_args(quantize, device, model_kwargs)
    except ConfigurationError as e:
        # Invalid quantization: report it and continue unquantized.
        language_server.show_message(
            f'{self.name}: {str(e)}',
            MessageType.Error,
        )
        self.config[self.CONFIGURATION_QUANTIZE] = 32

    model = self.config.get(self.CONFIGURATION_MODEL, self.SETTINGS_DEFAULT_MODEL)
    self._corrector = pipeline(
        'text2text-generation',
        model,
        device=device,
        model_kwargs=model_kwargs,
    )

def corrector(self, text):
    """Run the wrapped Huggingface pipeline on *text* and return its output."""
    run_pipeline = self._corrector
    return run_pipeline(text)

def _analyse_lines(self, text, doc, offset=0) -> Tuple[List[Diagnostic], List[CodeAction]]:
diagnostics = list()
code_actions = list()
Expand Down
21 changes: 20 additions & 1 deletion textLSP/analysers/hf_completion/hf_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
CodeAction,
)
from pygls.server import LanguageServer
from lsprotocol.types import MessageType
from transformers import pipeline

from ..analyser import Analyser
from ...types import ConfigurationError
from ... import nn_utils


logger = logging.getLogger(__name__)
Expand All @@ -23,19 +25,36 @@ class HFCompletionAnalyser(Analyser):
CONFIGURATION_MODEL = 'model'
CONFIGURATION_TOP_K = 'topk'
CONFIGURATION_CONTEXT_SIZE = 'context_size'
CONFIGURATION_QUANTIZE = 'quantize'

SETTINGS_DEFAULT_GPU = False
SETTINGS_DEFAULT_MODEL = 'bert-base-multilingual-cased'
SETTINGS_DEFAULT_TOP_K = 5
SETTINGS_DEFAULT_CONTEXT_SIZE = 50
SETTINGS_DEFAULT_QUANTIZE = 32

def __init__(self, language_server: LanguageServer, config: dict, name: str):
super().__init__(language_server, config, name)
use_gpu = self.config.get(self.CONFIGURATION_GPU, self.SETTINGS_DEFAULT_GPU)
device = nn_utils.get_device(use_gpu)

quanitze = self.config.setdefault(self.CONFIGURATION_QUANTIZE, self.SETTINGS_DEFAULT_QUANTIZE)
model_kwargs = dict()
try:
nn_utils.set_quantization_args(quanitze, device, model_kwargs)
except ConfigurationError as e:
language_server.show_message(
f'{self.name}: {str(e)}',
MessageType.Error,
)
self.config[self.CONFIGURATION_QUANTIZE] = 32

model = self.config.get(self.CONFIGURATION_MODEL, self.SETTINGS_DEFAULT_MODEL)
self.completor = pipeline(
'fill-mask',
model,
device='cuda:0' if self.config.get(self.CONFIGURATION_GPU, self.SETTINGS_DEFAULT_GPU) else 'cpu',
device=device,
model_kwargs=model_kwargs,
)
if self.completor.tokenizer.mask_token is None:
raise ConfigurationError(f'The tokenizer of {model} does not have a MASK token.')
Expand Down
1 change: 1 addition & 0 deletions textLSP/analysers/hf_instruction_checker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .hf_instruction_checker import HFInstructionCheckerAnalyser
Loading

0 comments on commit d89dfce

Please sign in to comment.