Skip to content

Commit

Permalink
Merge pull request #161 from bentasker/instance_banlist
Browse files Browse the repository at this point in the history
feat: add support for instance banlist
  • Loading branch information
nanos authored Sep 2, 2024
2 parents 058643b + ed40ff3 commit fe6ce1a
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ Option | Required? | Notes |
|:----------------------------------------------------|-----------|:------|
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to.
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.
Expand Down
9 changes: 8 additions & 1 deletion find_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
# Comma-separated hosts that must never be contacted; parsed into INSTANCE_BLOCKLIST at startup.
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-separated array of instances that FediFetcher should never try to connect to")

def get_notification_users(server, access_token, known_users, max_age):
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
Expand Down Expand Up @@ -1120,6 +1121,10 @@ def can_fetch(user_agent, url):
parsed_uri = urlparse(url)
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

if parsed_uri.netloc in INSTANCE_BLOCKLIST:
# Never connect to these locations
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")

robotsTxt = get_robots_from_url(robots_url)
if isinstance(robotsTxt, bool):
return robotsTxt
Expand Down Expand Up @@ -1501,7 +1506,8 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
"on_done",
"on_fail",
"log_level",
"log_format"
"log_format",
"instance_blocklist"
]:
value = int(value)
setattr(arguments, envvar, value)
Expand Down Expand Up @@ -1572,6 +1578,7 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')

# Split the comma-separated option into host names. Blank entries are dropped so
# that an unset option (default "") or a stray/trailing comma does not put ""
# into the blocklist, which would make any URL with an empty netloc look blocked.
INSTANCE_BLOCKLIST = [host.strip() for host in arguments.instance_blocklist.split(",") if host.strip()]
ROBOTS_TXT = {}

seen_urls = OrderedSet([])
Expand Down
1 change: 1 addition & 0 deletions tests/test_find_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url):
# Prepare mocks
mock_robotsTxt = MagicMock()
mock_robotParser = MagicMock()
find_posts.INSTANCE_BLOCKLIST = []

# Mock return values
mock_get_robots_from_url.return_value = mock_robotsTxt
Expand Down

0 comments on commit fe6ce1a

Please sign in to comment.