diff --git a/.changeset/dull-hotels-beam.md b/.changeset/dull-hotels-beam.md new file mode 100644 index 00000000000..40a41470ec4 --- /dev/null +++ b/.changeset/dull-hotels-beam.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Prevent hashtag emoji from being parsed as a tag diff --git a/.changeset/short-suits-destroy.md b/.changeset/short-suits-destroy.md new file mode 100644 index 00000000000..210d9f00468 --- /dev/null +++ b/.changeset/short-suits-destroy.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Properly calculate length of tag diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts index 25edcd9e57b..7b5444a68a5 100644 --- a/packages/api/src/rich-text/detection.ts +++ b/packages/api/src/rich-text/detection.ts @@ -70,27 +70,25 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { } } { - const re = /(?:^|\s)(#[^\d\s]\S*)(?=\s)?/g + const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g while ((match = re.exec(text.utf16))) { - let [tag] = match - const hasLeadingSpace = /^\s/.test(tag) + let [, leading, tag] = match tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation - // inclusive of #, max of 64 chars - if (tag.length > 66) continue + if (tag.length === 0 || tag.length > 64) continue - const index = match.index + (hasLeadingSpace ? 1 : 0) + const index = match.index + leading.length facets.push({ index: { byteStart: text.utf16IndexToUtf8Index(index), - byteEnd: text.utf16IndexToUtf8Index(index + tag.length), // inclusive of last char + byteEnd: text.utf16IndexToUtf8Index(index + 1 + tag.length), }, features: [ { $type: 'app.bsky.richtext.facet#tag', - tag: tag.replace(/^#/, ''), + tag: tag, }, ], }) diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts index 9498005076c..b83a841405b 100644 --- a/packages/api/tests/rich-text-detection.test.ts +++ b/packages/api/tests/rich-text-detection.test.ts @@ -241,15 +241,16 @@ describe('detectFacets', () => { ['body #1', [], []], ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]], ['#', [], []], + ['#?', [], []], ['text #', [], []], ['text # text', [], []], [ - 'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', - ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], - [{ byteStart: 5, byteEnd: 71 }], + 'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', + ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], + [{ byteStart: 5, byteEnd: 70 }], ], [ - 'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab', + 'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab', [], [], ], @@ -297,6 +298,17 @@ describe('detectFacets', () => { { byteStart: 17, byteEnd: 22 }, ], ], + ['this #️⃣tag should not be a tag', [], []], + [ + 'this ##️⃣tag should be a tag', + ['#️⃣tag'], + [ + { + byteStart: 5, + byteEnd: 16, + }, + ], + ], ] for (const [input, tags, indices] of inputs) {