Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add content_reference link parsing #276

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ interface ApiSession {

type ModelSlug = 'text-davinci-002-render-sha' | 'text-davinci-002-render-paid' | 'text-davinci-002-browse' | 'gpt-4' | 'gpt-4-browsing' | 'gpt-4o'

/**
 * One entry of `MessageMeta.content_references`.
 *
 * The HTML and Markdown exporters look for `matched_text` inside the message
 * text and, for `'webpage'` entries, replace it with a link built from the
 * optional display fields below.
 */
export interface Reference {
    /** Reference kind. The exporters only turn `'webpage'` entries into links. */
    type: 'webpage' | 'hidden' | 'sources_footnote'
    /** Literal text in the message that the exporters search for and replace. */
    matched_text: string
    // NOTE(review): presumably the offsets of `matched_text` within the message
    // text; the exporters do not read them — confirm against the API payload.
    start_idx: number
    end_idx: number
    /** Alternative text; the Markdown exporter uses it as the replacement when it is not null. */
    alt: string | null
    prompt_text: string | null
    invalid: boolean

    // Optional display fields.
    title?: string
    /** Link target used by the exporters. */
    url?: string
    /** Used by the HTML exporter as the link's `title` tooltip. */
    snippet?: string
    /** Used by the exporters as the visible link text when present. */
    attribution?: string
    attributions?: string
    attributions_debug?: string
    // NOTE(review): unit of this number is not visible here — confirm before formatting it.
    pub_date?: number
}

export interface Citation {
start_ix: number
end_ix: number
Expand Down Expand Up @@ -84,6 +101,7 @@ interface MessageMeta {
model_slug?: ModelSlug & (string & {})
parent_id?: string
timestamp_?: 'absolute' & (string & {})
content_references?: Reference[]
citations?: Citation[]
_cite_metadata?: CiteMetadata
}
Expand Down
20 changes: 15 additions & 5 deletions src/exporter/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ function conversationToHtml(conversation: ConversationResult, avatar: string, me

let postSteps: Array<(input: string) => string> = []
if (message.author.role === 'assistant') {
postSteps = [...postSteps, input => transformFootNotes(input, message.metadata)]
postSteps.push((input) => {
const matches = input.match(LatexRegex)

Expand Down Expand Up @@ -139,6 +138,7 @@ function conversationToHtml(conversation: ConversationResult, avatar: string, me

return transformed
})
postSteps = [...postSteps, input => transformFootNotes(input, message.metadata)]
}
if (message.author.role === 'user') {
postSteps = [...postSteps, input => `<p class="no-katex">${escapeHtml(input)}</p>`]
Expand Down Expand Up @@ -238,13 +238,23 @@ function transformFootNotes(
) {
// 【11†(PrintWiki)】
const footNoteMarkRegex = /【(\d+)†\((.+?)\)】/g
return input.replace(footNoteMarkRegex, (match, citeIndex, _evidenceText) => {
input = input.replace(footNoteMarkRegex, (match, citeIndex) => {
const citation = metadata?.citations?.find(cite => cite.metadata?.extra?.cited_message_idx === +citeIndex)
// We simply remove the foot note mark in html output
if (citation) return ''
return citation ? '' : match
})

return match
metadata?.content_references?.forEach((ref) => {
if (ref.type === 'webpage' && ref.matched_text) {
const tooltip = ref.snippet ?? ''
const linkText = (ref.attribution ?? ref.url?.match(/\/\/([^/]+)/)?.[1]) || 'Link'
const linkHtml = `<a href="${ref.url}" title="${tooltip}" target="_blank" rel="noopener noreferrer">${linkText}</a>`
const referenceRegex = new RegExp(ref.matched_text.replace(/([.*+?^${}()|\[\]\/\\])/g, '\\$&'), 'g')
input = input.replace(referenceRegex, linkHtml)
}
})
input = input.replace(/[\uE203\uE204]/g, '')

return input
}

/**
Expand Down
22 changes: 17 additions & 5 deletions src/exporter/markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ function conversationToMarkdown(conversation: ConversationResult, metaList?: Exp
const date = new Date(timestamp * 1000)
// format: 20:12 / 08:12 PM
const conversationTime = date.toLocaleTimeString('en-US', { hour: '2-digit', minute: '2-digit', hour12: !timeStamp24H })
timestampHtml = `<time datetime="${date.toISOString()}" title="${date.toLocaleString()}">${conversationTime}</time>\n\n`
timestampHtml = `*(<time datetime="${date.toISOString()}" title="${date.toLocaleString()}">${conversationTime}</time>)*`
}

const author = transformAuthor(message.author)
Expand Down Expand Up @@ -167,7 +167,7 @@ function conversationToMarkdown(conversation: ConversationResult, metaList?: Exp
const postProcess = (input: string) => postSteps.reduce((acc, fn) => fn(acc), input)
const content = transformContent(message.content, message.metadata, postProcess)

return `#### ${author}:\n${timestampHtml}${content}`
return `#### ${author}: ${timestampHtml}\n\n${content}`
}).filter(Boolean).join('\n\n')

const markdown = `${frontMatter}# ${title}\n\n${content}`
Expand Down Expand Up @@ -209,14 +209,26 @@ function transformFootNotes(

return match
})

// Process references (replacing matched_text with markdown links or placeholders)
const processedOutput = (metadata?.content_references ?? []).reduce((text, ref) => {
if (ref.type === 'webpage' && ref.matched_text) {
const linkText = ref.alt ?? `[(${ref.attribution ?? 'Link'})](${ref.url})`
const referenceRegex = new RegExp(ref.matched_text.replace(/([.*+?^${}()|\[\]\/\\])/g, '\\$&'), 'g')
return text.replace(referenceRegex, linkText)
}
return text
}, output.replace(/[\uE203\uE204]/g, ''))

const citationText = citationList.map((citation) => {
const citeIndex = citation.metadata?.extra?.cited_message_idx ?? 1
const citeTitle = citation.metadata?.title ?? 'No title'
return `[^${citeIndex}]: ${citeTitle}`
const citeUrl = citation.metadata?.url ?? ''
return citeUrl ? `[^${citeIndex}]: [${citeTitle}](${citeUrl})` : `[^${citeIndex}]: ${citeTitle}`
}).join('\n')

// Foot notes are placed at the end of the conversation node, not the end of the whole document
return `${output}\n\n${citationText}`
// Footnotes are placed at the end of the conversation node, not the whole document
return `${processedOutput}\n\n${citationText}\n---`
}

/**
Expand Down
46 changes: 38 additions & 8 deletions src/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -223,28 +223,57 @@
}

a {
color: var(--tw-prose-links);
font-size: 0.8rem;
text-decoration-line: underline;
text-underline-offset: 2px;
display: inline-block;
padding: 6px 6px;
border-radius: 12px;
background-color: var(--link-bg, #f1f1f1);
color: var(--link-color, #0d0d0d);
font-size: 0.9rem;
text-decoration: none;
text-align: center;
line-height: 1;
box-shadow: none;
transition: background-color 0.3s ease, color 0.3s ease;
}

a:hover {
background-color: var(--link-hover-bg, #e0e0e0);
}

[data-theme="dark"] a {
background-color: var(--link-bg-dark, #333333);
color: var(--link-color-dark, #ececec);
}

[data-theme="dark"] a:hover {
background-color: var(--link-hover-bg-dark, #444444);
}

.conversation-content > p:first-child,
ol:first-child {
margin-top: 0;
}

p>code, li>code {
p > code, li > code {
color: var(--tw-prose-code);
font-weight: 600;
font-size: .875em;
padding: 0.2em 0.4em;
border-radius: 4px;
background-color: var(--inline-code-bg, #f4f4f4);
font-family: "Fira Code", monospace;
}

[data-theme="dark"] p > code, [data-theme="dark"] li > code {
background-color: var(--inline-code-bg-dark, #2d2d2d);
color: var(--inline-code-color-dark, #e6e6e6);
}

p>code::before,
p>code::after,
li>code::before,
li>code::after {
content: "`";
content: none;
}

hr {
Expand All @@ -260,11 +289,12 @@
background-color: #000000;
overflow-x: auto;
margin: 0 0 1rem 0;
padding: 1rem;
border-radius: 0.375rem;
}

pre>code {
font-family: Söhne Mono, Monaco, Andale Mono, Ubuntu Mono, monospace !important;
font-family: "Fira Code", Monaco, Andale Mono, Ubuntu Mono, monospace !important;
font-weight: 400;
font-size: .875em;
line-height: 1.7142857;
Expand Down Expand Up @@ -575,7 +605,7 @@ <h1>
</h1>
<div class="conversation-export">
<p>Exported by
<a href="https://github.com/pionxzh/chatgpt-exporter.git">ChatGPT Exporter</a>
<a href="https://github.com/pionxzh/chatgpt-exporter">ChatGPT Exporter</a>
at {{time}}</p>
</div>
{{details}}
Expand Down
Loading