-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
327 additions
and
93 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
"""Compute named entities for a document using spaCy.""" | ||
import re | ||
from typing import Iterable, Optional | ||
|
||
import spacy | ||
from pydantic import Field as PydanticField | ||
from typing_extensions import override | ||
|
||
from ..data.dataset_utils import lilac_span | ||
from ..schema import Field, Item, RichData, SignalInputType, field | ||
from .signal import TextSignal | ||
|
||
# Key names for email results.
# NOTE(review): neither key is referenced by the code visible in this module;
# they look carried over from an email-detection signal -- confirm they are needed.
EMAILS_KEY = 'emails' | ||
NUM_EMAILS_KEY = 'num_emails' | ||
|
||
# Case-insensitive pattern for email addresses, described by its source as
# fully implementing RFC 5322.
# https://uibakery.io/regex-library/email-regex-python | ||
# The pattern is a regular (non-raw) string, so each doubled backslash below
# reaches the regex engine as a single backslash.
# NOTE(review): not referenced by the code visible in this module -- confirm it is needed.
EMAIL_REGEX = re.compile( | ||
"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", | ||
re.IGNORECASE) | ||
|
||
|
||
class SpacyNER(TextSignal):
    """Named entity recognition with spacy

    For details see: [spacy.io/models](https://spacy.io/models).
    """  # noqa: D415, D400
    name = 'spacy_ner'
    display_name = 'Spacy Named Entity Recognition'

    # spaCy package name or path of the pipeline to load in `setup()`.
    model: Optional[str] = PydanticField(
        title='SpaCy package name or model path.', default='en_core_web_sm', description='')

    input_type = SignalInputType.TEXT
    compute_type = SignalInputType.TEXT

    # Loaded pipeline; only assigned once `setup()` has been called.
    _nlp: spacy.language.Language

    @override
    def setup(self) -> None:
        """Load the spaCy pipeline named by `model` with only the NER component enabled."""
        self._nlp = spacy.load(
            # Honor the user-configured model; fall back to the declared default
            # when the optional field was explicitly set to None.
            self.model or 'en_core_web_sm',
            # Disable everything except the NER component. See: https://spacy.io/models
            # NOTE(review): these component names match the `en_core_web_sm` pipeline;
            # confirm they are appropriate for other models passed via `model`.
            disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

    @override
    def fields(self) -> Field:
        """Return the output schema: a list of string spans, each with a string `label`."""
        return field(fields=[field('string_span', fields={'label': 'string'})])

    @override
    def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
        """Yield, for every input row, its labeled entity spans or None.

        Args:
          data: The rows to process. Rows that are not `str` are treated as the
            empty string, so exactly one output is produced per input row.

        Yields:
          A list of spans (character offsets plus the entity label) for the row,
          or None when no entities were found.
        """
        text_data = (row if isinstance(row, str) else '' for row in data)

        for doc in self._nlp.pipe(text_data):
            result = [
                lilac_span(ent.start_char, ent.end_char, {'label': ent.label_})
                for ent in doc.ents
            ]
            # An empty list is reported as None.
            yield result or None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""Test the Spacy NER signal.""" | ||
|
||
from ..schema import field | ||
from .ner import SpacyNER | ||
from .splitters.text_splitter_test_utils import text_to_expected_spans | ||
|
||
|
||
def test_spacy_ner_fields() -> None:
    """The signal's schema is a list of string spans, each carrying a string `label`."""
    ner = SpacyNER()
    ner.setup()

    expected_schema = field(fields=[field('string_span', fields={'label': 'string'})])
    assert ner.fields() == expected_schema
|
||
|
||
def test_ner() -> None:
    """The signal labels the monetary amounts and the date phrase in a short financial text."""
    ner = SpacyNER()
    ner.setup()

    # The two literals are adjacent, so the sentences are joined without a space.
    text = ('Net income was $9.4 million compared to the prior year of $2.7 million.'
            'Revenue exceeded twelve billion dollars, with a loss of $1b.')
    entities = list(ner.compute([text]))

    # (phrase, expected entity label), in order of appearance in the text.
    labeled_phrases = [
        ('$9.4 million', 'MONEY'),
        ('the prior year', 'DATE'),
        ('$2.7 million', 'MONEY'),
        ('twelve billion dollars', 'MONEY'),
        ('1b', 'MONEY'),
    ]
    expected_spans = text_to_expected_spans(
        text, [(phrase, {'label': label}) for phrase, label in labeled_phrases])

    assert entities == [expected_spans]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import type {DatasetState} from '$lib/stores/datasetStore'; | ||
import type {DatasetViewStore} from '$lib/stores/datasetViewStore'; | ||
import type {SignalInfoWithTypedSchema} from '$lilac'; | ||
import type {SvelteComponent} from 'svelte'; | ||
import type {Readable} from 'svelte/store'; | ||
import type {SpanDetails} from './StringSpanDetails.svelte'; | ||
import StringSpanDetails from './StringSpanDetails.svelte'; | ||
|
||
// Inputs for `spanClick`: everything needed to build and refresh the
// StringSpanDetails popup that is shown when a span is clicked.
export interface SpanClickInfo { | ||
// Callback producing the details to display; invoked on each click and again
// whenever the click info is updated while a popup exists.
details: () => SpanDetails; | ||
// The next four members are passed to the popup as props unchanged.
datasetViewStore: DatasetViewStore; | ||
datasetStore: Readable<DatasetState>; | ||
embeddings: SignalInfoWithTypedSchema[]; | ||
addConceptLabel: ( | ||
conceptName: string, | ||
conceptNamespace: string, | ||
text: string, | ||
label: boolean | ||
) => void; | ||
} | ||
|
||
/**
 * Shows a StringSpanDetails popup when `element` is clicked.
 *
 * The return value has the `update`/`destroy` shape used by Svelte actions.
 *
 * @param element The span element whose clicks open the popup.
 * @param clickInfo Data and callbacks forwarded to the popup as props.
 * @returns An object with `update`, which swaps in new click info and refreshes
 *   the details of an open popup, and `destroy`, which removes the click
 *   listener and tears down any open popup.
 */
export function spanClick(element: HTMLSpanElement, clickInfo: SpanClickInfo) {
  let spanDetailsComponent: SvelteComponent | undefined;
  let curClickInfo = clickInfo;
  // Register the named handler (hoisted function declaration) so that the very
  // same reference can be removed again in `destroy`.
  element.addEventListener('click', showClickDetails);

  function showClickDetails(e: MouseEvent) {
    // Tear down a popup left over from an earlier click; otherwise it would
    // stay mounted on document.body with no remaining reference to destroy it.
    destroyClickInfo();

    spanDetailsComponent = new StringSpanDetails({
      props: {
        details: curClickInfo.details(),
        clickPosition: {x: e.clientX, y: e.clientY},
        datasetViewStore: curClickInfo.datasetViewStore,
        datasetStore: curClickInfo.datasetStore,
        embeddings: curClickInfo.embeddings,
        addConceptLabel: curClickInfo.addConceptLabel
      },
      target: document.body
    });
    // Both a 'close' and a 'click' event from the popup dismiss it.
    spanDetailsComponent.$on('close', destroyClickInfo);
    spanDetailsComponent.$on('click', destroyClickInfo);
  }

  function destroyClickInfo() {
    spanDetailsComponent?.$destroy();
    spanDetailsComponent = undefined;
  }

  return {
    update(clickInfo: SpanClickInfo) {
      curClickInfo = clickInfo;

      // Keep an already-open popup in sync with the new details.
      spanDetailsComponent?.$set({
        details: curClickInfo.details()
      });
    },
    destroy() {
      element.removeEventListener('click', showClickDetails);
      destroyClickInfo();
    }
  };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.