Skip to content

Commit 6399333

Browse files
authored
A new way of sanitizing HTML, updated deps. (#143)
* updated deps
* Convert to and from Markdown to sanitise HTML
* dedupe the final image list against embedded images in content
1 parent 5717e6e commit 6399333

File tree

8 files changed

+106
-193
lines changed

8 files changed

+106
-193
lines changed

badge.svg

Lines changed: 1 addition & 1 deletion
Loading

go.mod

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,27 @@
11
module github.com/TheMightyGit/rssole
22

3-
go 1.22
3+
go 1.23
4+
5+
toolchain go1.23.2
46

57
require (
8+
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0
69
github.com/NYTimes/gziphandler v1.1.1
710
github.com/andybalholm/cascadia v1.3.2
11+
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81
812
github.com/k3a/html2text v1.2.1
913
github.com/mmcdole/gofeed v1.3.0
10-
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948
11-
golang.org/x/net v0.28.0
14+
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de
15+
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f
16+
golang.org/x/net v0.31.0
1217
)
1318

1419
require (
15-
github.com/PuerkitoBio/goquery v1.9.2 // indirect
20+
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 // indirect
21+
github.com/PuerkitoBio/goquery v1.10.0 // indirect
1622
github.com/json-iterator/go v1.1.12 // indirect
1723
github.com/mmcdole/goxpp v1.1.1 // indirect
1824
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
1925
github.com/modern-go/reflect2 v1.0.2 // indirect
20-
golang.org/x/text v0.17.0 // indirect
26+
golang.org/x/text v0.20.0 // indirect
2127
)

go.sum

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1+
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ=
2+
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk=
3+
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0 h1:k6vBBqTmQOqLnaYkELgCU/F9xVPt3xhO1754hvlP/HM=
4+
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0/go.mod h1:djCj8ehU80KpSAepQciLcNzrp8hwZ1vQFnYKRo4/Cio=
15
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
26
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
3-
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
4-
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
5-
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
6-
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
7-
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
7+
github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
8+
github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
89
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
910
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
1011
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
1112
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
1213
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
14+
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81 h1:5lyLWsV+qCkoYqsKUDuycESh9DEIPVKN6iCFeL7ag50=
15+
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
1316
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
1417
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
1518
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
@@ -19,21 +22,23 @@ github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7
1922
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
2023
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
2124
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
22-
github.com/mmcdole/gofeed v1.2.1 h1:tPbFN+mfOLcM1kDF1x2c/N68ChbdBatkppdzf/vDe1s=
23-
github.com/mmcdole/gofeed v1.2.1/go.mod h1:2wVInNpgmC85q16QTTuwbuKxtKkHLCDDtf0dCmnrNr4=
2425
github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
2526
github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
26-
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
27-
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
2827
github.com/mmcdole/goxpp v1.1.1 h1:RGIX+D6iQRIunGHrKqnA2+700XMCnNv0bAOOv5MUhx8=
2928
github.com/mmcdole/goxpp v1.1.1/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
3029
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
3130
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
3231
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
3332
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
3433
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
34+
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de h1:D5x39vF5KCwKQaw+OC9ZPiLVHXz3UFw2+psEX+gYcto=
35+
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de/go.mod h1:kJun4WP5gFuHZgRjZUWWuH1DTxCtxbHDOIJsudS8jzY=
3536
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
3637
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
38+
github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY=
39+
github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
40+
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
41+
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
3742
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
3843
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
3944
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
@@ -43,32 +48,27 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
4348
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
4449
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
4550
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
51+
github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
52+
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
4653
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
4754
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
48-
golang.org/x/exp v0.0.0-20240213143201-ec583247a57a h1:HinSgX1tJRX3KsL//Gxynpw5CTOAIPhgL4W8PNiIpVE=
49-
golang.org/x/exp v0.0.0-20240213143201-ec583247a57a/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc=
50-
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA=
51-
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
55+
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=
56+
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=
5257
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
5358
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
5459
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
5560
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
5661
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
57-
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
5862
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
5963
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
60-
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
6164
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
62-
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
63-
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
64-
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
65-
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
65+
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
66+
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
6667
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
6768
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
6869
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
6970
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
7071
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
71-
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
7272
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
7373
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
7474
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -80,14 +80,11 @@ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
8080
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
8181
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
8282
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
83-
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
8483
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
8584
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
8685
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
87-
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
88-
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
89-
golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
90-
golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
86+
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
87+
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
9188
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
9289
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
9390
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=

internal/rssole/item.go

Lines changed: 52 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,31 @@
11
package rssole
22

33
import (
4-
"bytes"
54
"crypto/md5"
65
"encoding/hex"
6+
"log/slog"
77
"net/url"
8-
"regexp"
98
"strings"
109
"sync"
1110

11+
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
12+
"github.com/gomarkdown/markdown"
13+
"github.com/gomarkdown/markdown/html"
14+
"github.com/gomarkdown/markdown/parser"
1215
"github.com/k3a/html2text"
1316
"github.com/mmcdole/gofeed"
14-
"golang.org/x/exp/slog"
15-
"golang.org/x/net/html"
17+
"github.com/mpvl/unique"
1618
)
1719

1820
type wrappedItem struct {
1921
IsUnread bool
2022
Feed *feed
2123
*gofeed.Item
2224

23-
summary *string
24-
description *string
25-
descriptionImagesForDedupe *[]string
26-
images *[]string
27-
onceDescription sync.Once
25+
summary *string
26+
description *string
27+
images *[]string
28+
onceDescription sync.Once
2829
}
2930

3031
func (w *wrappedItem) MarkReadID() string {
@@ -46,14 +47,9 @@ func (w *wrappedItem) Images() []string {
4647

4748
images := []string{}
4849

49-
// NOTE: we exclude images that already appear in the description (gibiz)
50-
5150
// standard supplied image
5251
if w.Item.Image != nil {
53-
if !w.isDescriptionImage(w.Item.Image.URL) {
54-
// fmt.Println(w.Item.Image.URL)
55-
images = append(images, w.Item.Image.URL)
56-
}
52+
images = append(images, w.Item.Image.URL)
5753
}
5854

5955
// mastodon/gibiz images
@@ -62,11 +58,7 @@ func (w *wrappedItem) Images() []string {
6258
for _, v := range content {
6359
if v.Attrs["medium"] == "image" {
6460
imageURL := v.Attrs["url"]
65-
if !w.isDescriptionImage(imageURL) {
66-
// fmt.Println(w.Description())
67-
// fmt.Printf("%v = %+v\n", k, imageUrl)
68-
images = append(images, imageURL)
69-
}
61+
images = append(images, imageURL)
7062
}
7163
}
7264
}
@@ -91,34 +83,28 @@ func (w *wrappedItem) Images() []string {
9183
}
9284
}
9385

94-
w.images = &images
95-
96-
return *w.images
97-
}
86+
// Now... remove any meta images that are embedded in the description.
87+
// Ignore any query string args.
9888

99-
func (w *wrappedItem) isDescriptionImage(src string) bool {
100-
// strip anything after ? to get rid of query string part
101-
srcNoQueryString := strings.Split(src, "?")[0]
89+
dedupedImages := []string{}
10290

103-
if w.descriptionImagesForDedupe == nil {
104-
// force lazy load if it hasn't already
105-
_ = w.Description()
106-
}
107-
108-
for _, v := range *w.descriptionImagesForDedupe {
109-
// fmt.Println(v, "==", src)
110-
if v == srcNoQueryString {
111-
return true
91+
// Remove any image sources already within the description...
92+
for _, img := range images {
93+
srcNoQueryString := strings.Split(img, "?")[0]
94+
if !strings.Contains(w.Description(), srcNoQueryString) {
95+
dedupedImages = append(dedupedImages, img)
96+
} else {
97+
slog.Info("dedeuped meta image as already found in content", "src", img)
11298
}
11399
}
114100

115-
return false
116-
}
101+
// Remove any internal duplicates within the list...
102+
unique.Strings(&dedupedImages)
117103

118-
var (
119-
tagsToRemoveRe = regexp.MustCompile("script|style|link|meta|iframe|form")
120-
attrsToRemoveRe = regexp.MustCompile("style|class|hx-.*|data-.*|srcset|width|height|sizes|loading|decoding|target")
121-
)
104+
w.images = &dedupedImages
105+
106+
return *w.images
107+
}
122108

123109
func (w *wrappedItem) Description() string {
124110
w.onceDescription.Do(func() {
@@ -156,89 +142,32 @@ func (w *wrappedItem) Description() string {
156142
}
157143
}
158144

159-
// try and sanitise any html
160-
doc, err := html.Parse(strings.NewReader(*desc))
161-
if err != nil {
162-
// failed to sanitise, so just return as is...
163-
slog.Warn("html.Parse failed, returning unsanitised content", "error", err)
145+
// Now simplify the (potential) HTML by converting
146+
// it to and from markdown.
164147

165-
w.description = desc
166-
} else {
167-
w.descriptionImagesForDedupe = &[]string{}
168-
toDelete := []*html.Node{}
169-
170-
var f func(*html.Node)
171-
f = func(n *html.Node) {
172-
// fmt.Println(n)
173-
if n.Type == html.ElementNode {
174-
// fmt.Println(n.Data)
175-
if tagsToRemoveRe.MatchString(n.Data) {
176-
// fmt.Println("removing", n.Data, "tag")
177-
toDelete = append(toDelete, n)
178-
179-
return
180-
}
181-
182-
allowedAttrs := []html.Attribute{}
183-
184-
for i := range n.Attr {
185-
if !attrsToRemoveRe.MatchString(n.Attr[i].Key) {
186-
allowedAttrs = append(allowedAttrs, n.Attr[i])
187-
}
188-
}
189-
190-
n.Attr = allowedAttrs
191-
192-
if n.Data == "a" {
193-
// fmt.Println("making", n.Data, "tag target new tab")
194-
n.Attr = append(n.Attr, html.Attribute{
195-
Namespace: "",
196-
Key: "target",
197-
Val: "_new",
198-
})
199-
// disable href if it starts with #
200-
for i := range n.Attr {
201-
if n.Attr[i].Key == "href" && n.Attr[i].Val[0] == '#' {
202-
n.Attr[i].Key = "xxxhref" // easier than removing the attr
203-
204-
break
205-
}
206-
}
207-
}
208-
209-
if n.Data == "img" || n.Data == "svg" {
210-
// fmt.Println("making", n.Data, "tag style max-width 60%")
211-
n.Attr = append(n.Attr, html.Attribute{
212-
Namespace: "",
213-
Key: "style",
214-
Val: "max-width: 60%;",
215-
})
216-
// keep a note of images so we can de-dupe attached
217-
// images that also appear in the content.
218-
for _, a := range n.Attr {
219-
if a.Key == "src" {
220-
// strip anything after ? to get rid of query string part
221-
bits := strings.Split(a.Val, "?")
222-
*w.descriptionImagesForDedupe = append(*w.descriptionImagesForDedupe, bits[0])
223-
}
224-
}
225-
}
226-
}
148+
// First convert rando HTML to Markdown....
149+
doc, err := htmltomarkdown.ConvertString(*desc)
227150

228-
for c := n.FirstChild; c != nil; c = c.NextSibling {
229-
f(c)
230-
}
231-
}
232-
f(doc)
151+
switch {
152+
case err != nil:
153+
slog.Warn("htmltomarkdown.ConvertString failed, returning unsanitised content", "error", err)
233154

234-
for _, n := range toDelete {
235-
n.Parent.RemoveChild(n)
236-
}
155+
w.description = desc
156+
case doc == "":
157+
slog.Warn("htmltomarkdown.ConvertString result blank, using original.")
237158

238-
renderBuf := bytes.NewBufferString("")
239-
_ = html.Render(renderBuf, doc)
240-
desc := renderBuf.String()
241-
w.description = &desc
159+
w.description = desc
160+
default:
161+
// parse markdown
162+
p := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock)
163+
md := p.Parse([]byte(doc))
164+
165+
// render to HTML (we choose to exclude embedded images and rely on them being passed in metadata)
166+
renderer := html.NewRenderer(html.RendererOptions{
167+
Flags: html.CommonFlags | html.HrefTargetBlank,
168+
})
169+
mdHTML := string(markdown.Render(md, renderer))
170+
w.description = &mdHTML
242171
}
243172
})
244173

@@ -257,6 +186,8 @@ func (w *wrappedItem) Summary() string {
257186
plainDesc = plainDesc[:maxDescriptionLength]
258187
}
259188

189+
plainDesc = strings.TrimSpace(plainDesc)
190+
260191
// if summary is identical to title return nothing
261192
if plainDesc == w.Title {
262193
plainDesc = ""

0 commit comments

Comments
 (0)