class MarkdownCodeBlockSignal(TextSignal):
  """Finds fenced markdown code blocks in text.

  Each match of `MARKDOWN_RE` is emitted as a span running from the opening
  fence through the closing fence, annotated with the language tag written
  directly after the opening fence. The tag is the empty string when the fence
  carries no language.
  """

  name: ClassVar[str] = 'markdown_code_block'
  display_name: ClassVar[str] = 'Markdown Code Block Detection'

  @override
  def fields(self) -> Field:
    """Return the output schema: a list of string spans with a `language` string."""
    return field(
      fields=[
        field(
          dtype='string_span',
          fields={'language': 'string'},
        )
      ]
    )

  @override
  def compute(self, data: Iterable[RichData]) -> Iterator[Optional[Item]]:
    """Yield, for every input document, the list of code-block spans found in it.

    Args:
      data: The documents to scan. Each one is treated as a `str`.

    Yields:
      One list per document, in input order. Each entry is built with
      `span(start, end, {'language': ...})` from a regex match; the list is
      empty when the document has no match.
    """
    # Compiled once per call. DOTALL lets `.` cross newlines so a block body can
    # span multiple lines.
    markdown_re = re.compile(MARKDOWN_RE, re.MULTILINE | re.DOTALL)
    for doc in data:
      text = cast(str, doc)
      # A single pass over the text: the match object already carries both the
      # offsets and the captured language (group 1), so there is no need for a
      # second `findall` scan zipped against the first.
      spans: list[Item] = [
        span(match.start(), match.end(), {'language': match.group(1)})
        for match in markdown_re.finditer(text)
      ]

      yield spans
+ +```py +class MyForm(forms.ModelForm): + extra_field = forms.CharField(widget=forms.Textarea()) + class Meta: + model = MyModel +``` + +Could you explain why my first approach does not work and what I am doing wrong? + +Here is the console output: +``` +fake output +``` + +""" + markdown_blocks = list(signal.compute([text])) + + expected_spans = text_to_expected_spans( + text, + [ + ( + """```python +class MyForm(forms.ModelForm): + extra_field = forms.CharField() + class Meta: + model = MyModel + widgets = { + 'extra_field': forms.Textarea(attrs={'placeholder': u'Bla bla'}), + } +```""", + {'language': 'python'}, + ), + ( + """```py +class MyForm(forms.ModelForm): + extra_field = forms.CharField(widget=forms.Textarea()) + class Meta: + model = MyModel +```""", + {'language': 'py'}, + ), + ( + """``` +fake output +```""", + {'language': ''}, + ), + ], + ) + + assert markdown_blocks == [expected_spans] diff --git a/notebooks/CurateCodingDataset.ipynb b/notebooks/CurateCodingDataset.ipynb index fedfc006d..3718b6f1c 100644 --- a/notebooks/CurateCodingDataset.ipynb +++ b/notebooks/CurateCodingDataset.ipynb @@ -193,7 +193,7 @@ "import lilac as ll\n", "from pprint import pprint\n", "\n", - "code_block_re = re.compile('```(py|python)\\n(.*)?\\n```', re.MULTILINE | re.DOTALL)\n", + "code_block_re = re.compile('```(py|python)\\n(.*?)\\n```', re.MULTILINE | re.DOTALL)\n", "\n", "\n", "# Format the code blocks of the \"answer\" column using the `ruff`` formatter.\n", diff --git a/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte b/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte index 4e49be103..57b6c0913 100644 --- a/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte +++ b/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte @@ -511,7 +511,7 @@ } } }); - } else if (renderSpan.isLeafSpan) { + } else if (renderSpan.isMetadata) { spanDecorations.push({ range, options: { @@ -678,7 +678,7 
@@ @apply border-r-8 border-orange-700 opacity-20; } :global(.leaf-text) { - @apply font-extrabold text-violet-500 underline; + @apply text-violet-500; } /** Deep-linked selection */ diff --git a/web/blueprint/src/lib/components/datasetView/spanHighlight.ts b/web/blueprint/src/lib/components/datasetView/spanHighlight.ts index 252ace943..9c2778f56 100644 --- a/web/blueprint/src/lib/components/datasetView/spanHighlight.ts +++ b/web/blueprint/src/lib/components/datasetView/spanHighlight.ts @@ -57,6 +57,7 @@ export interface MonacoRenderSpan { isKeywordSearch: boolean; isConceptSearch: boolean; isSemanticSearch: boolean; + isMetadata: boolean; isLeafSpan: boolean; hasNonNumericMetadata: boolean; @@ -103,6 +104,7 @@ export function getMonacoRenderSpans( const isConceptSearch = valueInfo.type === 'concept_score'; const isSemanticSearch = valueInfo.type === 'semantic_similarity'; const hasNonNumericMetadata = valueInfo.type === 'metadata' && !isNumeric(valueInfo.dtype); + const isMetadata = valueInfo.type === 'metadata'; const isLeafSpan = valueInfo.type === 'leaf_span'; let isHighlighted = false; @@ -124,6 +126,7 @@ export function getMonacoRenderSpans( isKeywordSearch, isConceptSearch, isSemanticSearch, + isMetadata, isLeafSpan, hasNonNumericMetadata, namedValue,