1
1
package rssole
2
2
3
3
import (
4
- "bytes"
5
4
"crypto/md5"
6
5
"encoding/hex"
6
+ "log/slog"
7
7
"net/url"
8
- "regexp"
9
8
"strings"
10
9
"sync"
11
10
11
+ htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
12
+ "github.com/gomarkdown/markdown"
13
+ "github.com/gomarkdown/markdown/html"
14
+ "github.com/gomarkdown/markdown/parser"
12
15
"github.com/k3a/html2text"
13
16
"github.com/mmcdole/gofeed"
14
- "golang.org/x/exp/slog"
15
- "golang.org/x/net/html"
17
+ "github.com/mpvl/unique"
16
18
)
17
19
18
20
type wrappedItem struct {
19
21
IsUnread bool
20
22
Feed * feed
21
23
* gofeed.Item
22
24
23
- summary * string
24
- description * string
25
- descriptionImagesForDedupe * []string
26
- images * []string
27
- onceDescription sync.Once
25
+ summary * string
26
+ description * string
27
+ images * []string
28
+ onceDescription sync.Once
28
29
}
29
30
30
31
func (w * wrappedItem ) MarkReadID () string {
@@ -46,14 +47,9 @@ func (w *wrappedItem) Images() []string {
46
47
47
48
images := []string {}
48
49
49
- // NOTE: we exclude images that already appear in the description (gibiz)
50
-
51
50
// standard supplied image
52
51
if w .Item .Image != nil {
53
- if ! w .isDescriptionImage (w .Item .Image .URL ) {
54
- // fmt.Println(w.Item.Image.URL)
55
- images = append (images , w .Item .Image .URL )
56
- }
52
+ images = append (images , w .Item .Image .URL )
57
53
}
58
54
59
55
// mastodon/gibiz images
@@ -62,11 +58,7 @@ func (w *wrappedItem) Images() []string {
62
58
for _ , v := range content {
63
59
if v .Attrs ["medium" ] == "image" {
64
60
imageURL := v .Attrs ["url" ]
65
- if ! w .isDescriptionImage (imageURL ) {
66
- // fmt.Println(w.Description())
67
- // fmt.Printf("%v = %+v\n", k, imageUrl)
68
- images = append (images , imageURL )
69
- }
61
+ images = append (images , imageURL )
70
62
}
71
63
}
72
64
}
@@ -91,34 +83,28 @@ func (w *wrappedItem) Images() []string {
91
83
}
92
84
}
93
85
94
- w .images = & images
95
-
96
- return * w .images
97
- }
86
+ // Now... remove any meta images that are embedded in the description.
87
+ // Ignore any query string args.
98
88
99
- func (w * wrappedItem ) isDescriptionImage (src string ) bool {
100
- // strip anything after ? to get rid of query string part
101
- srcNoQueryString := strings .Split (src , "?" )[0 ]
89
+ dedupedImages := []string {}
102
90
103
- if w .descriptionImagesForDedupe == nil {
104
- // force lazy load if it hasn't already
105
- _ = w .Description ()
106
- }
107
-
108
- for _ , v := range * w .descriptionImagesForDedupe {
109
- // fmt.Println(v, "==", src)
110
- if v == srcNoQueryString {
111
- return true
91
+ // Remove any image sources already within the description...
92
+ for _ , img := range images {
93
+ srcNoQueryString := strings .Split (img , "?" )[0 ]
94
+ if ! strings .Contains (w .Description (), srcNoQueryString ) {
95
+ dedupedImages = append (dedupedImages , img )
96
+ } else {
97
+ slog .Info ("dedeuped meta image as already found in content" , "src" , img )
112
98
}
113
99
}
114
100
115
- return false
116
- }
101
+ // Remove any internal duplicates within the list...
102
+ unique . Strings ( & dedupedImages )
117
103
118
- var (
119
- tagsToRemoveRe = regexp . MustCompile ( "script|style|link|meta|iframe|form" )
120
- attrsToRemoveRe = regexp . MustCompile ( "style|class|hx-.*|data-.*|srcset|width|height|sizes|loading|decoding|target" )
121
- )
104
+ w . images = & dedupedImages
105
+
106
+ return * w . images
107
+ }
122
108
123
109
func (w * wrappedItem ) Description () string {
124
110
w .onceDescription .Do (func () {
@@ -156,89 +142,32 @@ func (w *wrappedItem) Description() string {
156
142
}
157
143
}
158
144
159
- // try and sanitise any html
160
- doc , err := html .Parse (strings .NewReader (* desc ))
161
- if err != nil {
162
- // failed to sanitise, so just return as is...
163
- slog .Warn ("html.Parse failed, returning unsanitised content" , "error" , err )
145
+ // Now simplify the (potential) HTML by converting
146
+ // it to and from markdown.
164
147
165
- w .description = desc
166
- } else {
167
- w .descriptionImagesForDedupe = & []string {}
168
- toDelete := []* html.Node {}
169
-
170
- var f func (* html.Node )
171
- f = func (n * html.Node ) {
172
- // fmt.Println(n)
173
- if n .Type == html .ElementNode {
174
- // fmt.Println(n.Data)
175
- if tagsToRemoveRe .MatchString (n .Data ) {
176
- // fmt.Println("removing", n.Data, "tag")
177
- toDelete = append (toDelete , n )
178
-
179
- return
180
- }
181
-
182
- allowedAttrs := []html.Attribute {}
183
-
184
- for i := range n .Attr {
185
- if ! attrsToRemoveRe .MatchString (n .Attr [i ].Key ) {
186
- allowedAttrs = append (allowedAttrs , n .Attr [i ])
187
- }
188
- }
189
-
190
- n .Attr = allowedAttrs
191
-
192
- if n .Data == "a" {
193
- // fmt.Println("making", n.Data, "tag target new tab")
194
- n .Attr = append (n .Attr , html.Attribute {
195
- Namespace : "" ,
196
- Key : "target" ,
197
- Val : "_new" ,
198
- })
199
- // disable href if it starts with #
200
- for i := range n .Attr {
201
- if n .Attr [i ].Key == "href" && n .Attr [i ].Val [0 ] == '#' {
202
- n .Attr [i ].Key = "xxxhref" // easier than removing the attr
203
-
204
- break
205
- }
206
- }
207
- }
208
-
209
- if n .Data == "img" || n .Data == "svg" {
210
- // fmt.Println("making", n.Data, "tag style max-width 60%")
211
- n .Attr = append (n .Attr , html.Attribute {
212
- Namespace : "" ,
213
- Key : "style" ,
214
- Val : "max-width: 60%;" ,
215
- })
216
- // keep a note of images so we can de-dupe attached
217
- // images that also appear in the content.
218
- for _ , a := range n .Attr {
219
- if a .Key == "src" {
220
- // strip anything after ? to get rid of query string part
221
- bits := strings .Split (a .Val , "?" )
222
- * w .descriptionImagesForDedupe = append (* w .descriptionImagesForDedupe , bits [0 ])
223
- }
224
- }
225
- }
226
- }
148
+ // First convert the arbitrary HTML to Markdown.
149
+ doc , err := htmltomarkdown .ConvertString (* desc )
227
150
228
- for c := n .FirstChild ; c != nil ; c = c .NextSibling {
229
- f (c )
230
- }
231
- }
232
- f (doc )
151
+ switch {
152
+ case err != nil :
153
+ slog .Warn ("htmltomarkdown.ConvertString failed, returning unsanitised content" , "error" , err )
233
154
234
- for _ , n := range toDelete {
235
- n . Parent . RemoveChild ( n )
236
- }
155
+ w . description = desc
156
+ case doc == "" :
157
+ slog . Warn ( "htmltomarkdown.ConvertString result blank, using original." )
237
158
238
- renderBuf := bytes .NewBufferString ("" )
239
- _ = html .Render (renderBuf , doc )
240
- desc := renderBuf .String ()
241
- w .description = & desc
159
+ w .description = desc
160
+ default :
161
+ // parse markdown
162
+ p := parser .NewWithExtensions (parser .CommonExtensions | parser .AutoHeadingIDs | parser .NoEmptyLineBeforeBlock )
163
+ md := p .Parse ([]byte (doc ))
164
+
165
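+ // render to HTML. NOTE(review): the stated intent was to exclude embedded images and rely on them being passed in metadata, but the Flags below do not include html.SkipImages, so embedded images appear to still be rendered - confirm which is intended.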
166
+ renderer := html .NewRenderer (html.RendererOptions {
167
+ Flags : html .CommonFlags | html .HrefTargetBlank ,
168
+ })
169
+ mdHTML := string (markdown .Render (md , renderer ))
170
+ w .description = & mdHTML
242
171
}
243
172
})
244
173
@@ -257,6 +186,8 @@ func (w *wrappedItem) Summary() string {
257
186
plainDesc = plainDesc [:maxDescriptionLength ]
258
187
}
259
188
189
+ plainDesc = strings .TrimSpace (plainDesc )
190
+
260
191
// if summary is identical to title return nothing
261
192
if plainDesc == w .Title {
262
193
plainDesc = ""
0 commit comments