fixed file descriptor leak due to keepalives, probably introduced by urllib3 2.0
fazalmajid committed May 20, 2023
1 parent f23f0b0 commit 3e545b9
Showing 5 changed files with 86 additions and 42 deletions.
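
Every hunk below applies the same pattern: a bare requests.get() call, which the commit message blames for leaking file descriptors through keep-alive connections under urllib3 2.0, is replaced by a short-lived requests.Session() that is explicitly closed in a finally: block, so the pooled socket is released even when the request raises. A minimal sketch of that pattern, with a hypothetical fetch() helper that is not part of this commit:

  import requests

  def fetch(url, timeout=30):
    # One throwaway Session per request; closing it tears down the
    # connection pool so keep-alive sockets are released even if the
    # request raises.
    s = requests.Session()
    try:
      return s.get(url, timeout=timeout).content
    finally:
      s.close()

  # Equivalent: use the Session as a context manager.
  # with requests.Session() as s:
  #   content = s.get(url, timeout=timeout).content
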
10 changes: 7 additions & 3 deletions tembozapp/autodiscovery.py
@@ -3,9 +3,13 @@
 from . import param
 
 def find(url):
-  html = requests.get(url,
-                      headers={'user-agent': param.user_agent},
-                      timeout=param.http_timeout).content
+  s = requests.Session()
+  try:
+    html = s.get(url,
+                 headers={'user-agent': param.user_agent},
+                 timeout=param.http_timeout).content
+  finally:
+    s.close()
   tree = html5lib.parse(html, namespaceHTMLElements=False)
   # base for relative URLs
   base = tree.findall('.//base')
8 changes: 6 additions & 2 deletions tembozapp/degunk.py
@@ -139,8 +139,12 @@ def apply(self, content, *args, **kwargs):
       item['link'] = link[0]
       return content
     # we haven't seen this article before, buck up and load it
-    deref = requests.get(item['link'],
-                         timeout=param.http_timeout).content
+    s = requests.Session()
+    try:
+      deref = s.get(item['link'],
+                    timeout=param.http_timeout).content
+    finally:
+      s.close()
     m = self.re.search(deref)
     if m and m.groups():
       item['link'] = m.groups()[0]
8 changes: 6 additions & 2 deletions tembozapp/filters.py
@@ -145,8 +145,12 @@ def link_extract(link_text, content):
 
 def dereference_content(url):
   try:
-    r = requests.get(url, timeout=param.http_timeout)
-    return r.content
+    s = requests.Session()
+    try:
+      r = s.get(url, timeout=param.http_timeout)
+      return r.content
+    finally:
+      s.close()
   except:
     return ''
 
42 changes: 23 additions & 19 deletions tembozapp/normalize.py
@@ -549,26 +549,30 @@ def dereference(url, seen=None, level=0, jar=None):
   if level > 16:
     return url
   try:
-    r = requests.get(url, allow_redirects=False, timeout=param.http_timeout,
-                     cookies=jar)
-    if not r.is_redirect:
-      return url
-    else:
-      jar.update(r.cookies)
-      # break a redirection loop if it occurs
-      redir = r.headers.get('Location')
-      if True not in [redir.startswith(p)
-                      for p in ['http://', 'https://', 'ftp://']]:
-        return url
-      if redir in seen:
-        return url
-      # some servers redirect to Unicode URLs, which are not legal
-      try:
-        str(redir)
-      except UnicodeDecodeError:
+    s = requests.Session()
+    try:
+      r = s.get(url, allow_redirects=False, timeout=param.http_timeout,
+                cookies=jar)
+      if not r.is_redirect:
         return url
-      # there might be several levels of redirection
-      return dereference(redir, seen, level + 1, jar)
+      else:
+        jar.update(r.cookies)
+        # break a redirection loop if it occurs
+        redir = r.headers.get('Location')
+        if True not in [redir.startswith(p)
+                        for p in ['http://', 'https://', 'ftp://']]:
+          return url
+        if redir in seen:
+          return url
+        # some servers redirect to Unicode URLs, which are not legal
+        try:
+          str(redir)
+        except UnicodeDecodeError:
+          return url
+        # there might be several levels of redirection
+        return dereference(redir, seen, level + 1, jar)
+    finally:
+      s.close()
   except (requests.exceptions.RequestException, ValueError, socket.error):
     return url
   except:
60 changes: 44 additions & 16 deletions tembozapp/update.py
@@ -50,8 +50,14 @@ def add_feed(feed_xml):
   c = db.cursor()
   feed_xml = feed_xml.replace('feed://', 'http://')
   # verify the feed
-  r = requests.get(feed_xml, timeout=param.http_timeout)
-  f = feedparser.parse(r.content)
+  s = requests.Session()
+  try:
+    r = s.get(feed_xml, timeout=param.http_timeout)
+    content = r.content
+    etag = r.headers.get('Etag')
+  finally:
+    s.close()
+  f = feedparser.parse(content)
   normalize.basic(f, feed_xml)
   if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
     original = feed_xml
@@ -60,19 +66,25 @@ def add_feed(feed_xml):
       raise AutoDiscoveryError
     print('add_feed:autodiscovery of', original, 'found', feed_xml,
           file=param.log)
-    r = requests.get(feed_xml, timeout=param.http_timeout)
-    f = feedparser.parse(r.text)
+    s = requests.Session()
+    try:
+      r = s.get(feed_xml, timeout=param.http_timeout)
+      text = r.text
+      etag = r.headers.get('Etag')
+    finally:
+      s.close()
+    f = feedparser.parse(text)
     normalize.basic(f, feed_xml)
     if not f.feed or 'url' not in f:
-      print('add_feed:autodiscovery failed %r %r' % (r.text, f.__dict__),
+      print('add_feed:autodiscovery failed %r %r' % (text, f.__dict__),
             file=param.log)
       raise ParseError
   # we have a valid feed, normalize it
   normalize.normalize_feed(f)
   feed = {
     'xmlUrl': f['url'],
     'htmlUrl': str(f.feed['link']),
-    'etag': r.headers.get('Etag'),
+    'etag': etag,
     'title': f.feed['title'],
     'desc': f.feed['description']
   }
@@ -100,8 +112,13 @@ def update_feed_xml(feed_uid, feed_xml):
   """Update a feed URL and fetch the feed. Returns the number of new items"""
   feed_uid = int(feed_uid)
 
-  r = requests.get(feed_xml, timeout=param.http_timeout)
-  f = feedparser.parse(r.content)
+  s = requests.Session()
+  try:
+    r = s.get(feed_xml, timeout=param.http_timeout)
+    content = r.content
+  finally:
+    s.close()
+  f = feedparser.parse(content)
   if not f.feed:
     raise ParseError
   normalize.normalize_feed(f)
@@ -279,7 +296,7 @@ def dedupe(feed_uid):
     select * from fm_items i2
     where i2.item_feed_uid=fm_items.item_feed_uid
     and i2.item_uid<>fm_items.item_uid
-    and i2.item_title=fm_items.item_title and i2.item_rating<>0
+    and i2.item_title=fm_items.item_title
   )""", [feed_uid])
   modified = c.rowcount
   db.commit()
@@ -306,8 +323,13 @@ def purge_reload(feed_uid):
   where feed_uid=?""", [feed_uid])
   feed_xml = c.fetchone()[0]
   db.commit()
-  r = requests.get(feed_xml, timeout=param.http_timeout)
-  f = feedparser.parse(r.content)
+  s = requests.Session()
+  try:
+    r = s.get(feed_xml, timeout=param.http_timeout)
+    content = r.content
+  finally:
+    s.close()
+  f = feedparser.parse(content)
   if not f.feed:
     raise ParseError
   normalize.normalize_feed(f)
@@ -360,12 +382,18 @@ def fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified):
   if not feed_modified:
     feed_modified = None
   try:
-    r = requests.get(feed_xml, headers={
-      'If-None-Match': feed_etag
-    }, timeout=param.http_timeout)
-    if r.content == '':
+    s = requests.Session()
+    try:
+      r = s.get(feed_xml, headers={
+        'If-None-Match': feed_etag
+      }, timeout=param.http_timeout)
+      content = r.content
+      etag = r.headers.get('Etag')
+    finally:
+      s.close()
+    if content == '':
       return {'channel': {}, 'items': [], 'why': 'no change since Etag'}
-    f = feedparser.parse(r.content, etag=r.headers.get('Etag'),
+    f = feedparser.parse(content, etag=etag,
                          modified=feed_modified)
   except (socket.timeout, requests.exceptions.RequestException) as e:
     if param.debug:
