Skip to content

Commit

Permalink
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 3 deletions.
2 changes: 2 additions & 0 deletions lilac/signals/default_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .concept_labels import ConceptLabelsSignal
from .concept_scorer import ConceptSignal
from .lang_detection import LangDetectionSignal
from .markdown_code_block import MarkdownCodeBlockSignal
from .near_dup import NearDuplicateSignal
from .ner import SpacyNER
from .pii import PIISignal
Expand All @@ -29,6 +30,7 @@ def register_default_signals() -> None:
register_signal(NearDuplicateSignal)
register_signal(LangDetectionSignal)
register_signal(ClusterHDBScan)
register_signal(MarkdownCodeBlockSignal)

# Embeddings.
register_signal(Cohere)
Expand Down
47 changes: 47 additions & 0 deletions lilac/signals/markdown_code_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Finds markdown code blocks.
NOTE: It would be great to use guesslang to detect the language automatically; however,
there is a dependency conflict with typing_extensions.
"""
import re
from typing import ClassVar, Iterable, Iterator, Optional, cast

from typing_extensions import override

from ..schema import Field, Item, RichData, field, span
from ..signal import TextSignal

# Matches a fenced code block. Group 1 is the text right after the opening fence (the language
# tag, possibly empty; it cannot contain spaces or newlines). Group 2 is the block body, matched
# lazily up to the first newline that is followed by a closing fence. Group 2 can only span
# multiple lines when the pattern is compiled with re.DOTALL.
MARKDOWN_RE = '```([^\n ]*?)\n(.*?)\n```'


class MarkdownCodeBlockSignal(TextSignal):
  """Finds fenced markdown code blocks in text.

  Emits one span per block. Each span covers the whole match, including the opening and closing
  fences, and carries a `language` string taken from the text immediately after the opening
  fence (empty when no language tag is present).
  """

  name: ClassVar[str] = 'markdown_code_block'
  display_name: ClassVar[str] = 'Markdown Code Block Detection'

  @override
  def fields(self) -> Field:
    """Return the output schema: a list of string spans, each with a `language` string."""
    return field(
      fields=[
        field(
          dtype='string_span',
          fields={'language': 'string'},
        )
      ]
    )

  @override
  def compute(self, data: Iterable[RichData]) -> Iterator[Optional[Item]]:
    """Yield, for each input document, the list of code-block spans found in it.

    Args:
      data: The documents to scan. Each one is treated as a string.

    Yields:
      One list per document, in input order. The list holds a span for every match of
      `MARKDOWN_RE`, in order of appearance, and is empty when the document has no code blocks.
    """
    # DOTALL lets the block body (group 2) span multiple lines.
    markdown_re = re.compile(MARKDOWN_RE, re.MULTILINE | re.DOTALL)
    for doc in data:
      text = cast(str, doc)
      # A single pass over the text: each match object provides both the span offsets and the
      # language tag (group 1), so there is no need to scan twice and pair results up with zip.
      yield [
        span(match.start(), match.end(), {'language': match.group(1)})
        for match in markdown_re.finditer(text)
      ]
82 changes: 82 additions & 0 deletions lilac/signals/markdown_code_block_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Test the Markdown Extractor signal."""

from ..schema import field
from ..splitters.text_splitter_test_utils import text_to_expected_spans
from .markdown_code_block import MarkdownCodeBlockSignal


def test_markdown_code_block_fields() -> None:
  """The signal's schema is a list of string spans, each annotated with a `language` string."""
  expected_schema = field(fields=[field('string_span', fields={'language': 'string'})])

  code_block_signal = MarkdownCodeBlockSignal()
  code_block_signal.setup()

  assert code_block_signal.fields() == expected_schema


def test_markdown_code_block() -> None:
  """Each fenced block in the text is emitted as a span labeled with its language tag."""
  # The document under test: two tagged blocks (`python`, `py`) and one untagged block.
  text = """
I am trying to add an extra field to my model form in Django.
```python
class MyForm(forms.ModelForm):
extra_field = forms.CharField()
class Meta:
model = MyModel
widgets = {
'extra_field': forms.Textarea(attrs={'placeholder': u'Bla bla'}),
}
```
However, it appears that the widget definition for 'extra_field' in the Meta class is ignored.
```py
class MyForm(forms.ModelForm):
extra_field = forms.CharField(widget=forms.Textarea())
class Meta:
model = MyModel
```
Could you explain why my first approach does not work and what I am doing wrong?
Here is the console output:
```
fake output
```
"""

  # The exact substrings, fences included, that the signal is expected to report.
  python_block = """```python
class MyForm(forms.ModelForm):
extra_field = forms.CharField()
class Meta:
model = MyModel
widgets = {
'extra_field': forms.Textarea(attrs={'placeholder': u'Bla bla'}),
}
```"""
  py_block = """```py
class MyForm(forms.ModelForm):
extra_field = forms.CharField(widget=forms.Textarea())
class Meta:
model = MyModel
```"""
  untagged_block = """```
fake output
```"""

  expected_spans = text_to_expected_spans(
    text,
    [
      (python_block, {'language': 'python'}),
      (py_block, {'language': 'py'}),
      (untagged_block, {'language': ''}),
    ],
  )

  code_block_signal = MarkdownCodeBlockSignal()
  code_block_signal.setup()
  markdown_blocks = list(code_block_signal.compute([text]))

  # One input document, so one list of spans is expected back.
  assert markdown_blocks == [expected_spans]
2 changes: 1 addition & 1 deletion notebooks/CurateCodingDataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@
"import lilac as ll\n",
"from pprint import pprint\n",
"\n",
"code_block_re = re.compile('```(py|python)\\n(.*)?\\n```', re.MULTILINE | re.DOTALL)\n",
"code_block_re = re.compile('```(py|python)\\n(.*?)\\n```', re.MULTILINE | re.DOTALL)\n",
"\n",
"\n",
"# Format the code blocks of the \"answer\" column using the `ruff`` formatter.\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@
}
}
});
} else if (renderSpan.isLeafSpan) {
} else if (renderSpan.isMetadata) {
spanDecorations.push({
range,
options: {
Expand Down Expand Up @@ -678,7 +678,7 @@
@apply border-r-8 border-orange-700 opacity-20;
}
:global(.leaf-text) {
@apply font-extrabold text-violet-500 underline;
@apply text-violet-500;
}
/** Deep-linked selection */
Expand Down
3 changes: 3 additions & 0 deletions web/blueprint/src/lib/components/datasetView/spanHighlight.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export interface MonacoRenderSpan {
isKeywordSearch: boolean;
isConceptSearch: boolean;
isSemanticSearch: boolean;
isMetadata: boolean;
isLeafSpan: boolean;
hasNonNumericMetadata: boolean;

Expand Down Expand Up @@ -103,6 +104,7 @@ export function getMonacoRenderSpans(
const isConceptSearch = valueInfo.type === 'concept_score';
const isSemanticSearch = valueInfo.type === 'semantic_similarity';
const hasNonNumericMetadata = valueInfo.type === 'metadata' && !isNumeric(valueInfo.dtype);
const isMetadata = valueInfo.type === 'metadata';
const isLeafSpan = valueInfo.type === 'leaf_span';

let isHighlighted = false;
Expand All @@ -124,6 +126,7 @@ export function getMonacoRenderSpans(
isKeywordSearch,
isConceptSearch,
isSemanticSearch,
isMetadata,
isLeafSpan,
hasNonNumericMetadata,
namedValue,
Expand Down

0 comments on commit 649c756

Please sign in to comment.