Module: tooter¶
This module is a bit of a wrapper around the Mastodon.py library to call the Mastodon API and fetch toots.
The idea is to create an object, a "Tooter," that encapsulates our conversation with one server. It basically takes a date, a hashtag, and a max. It starts "now" and goes backward in time until it gets a post that is too old or until it hits the max. It does assume that posts are returned in reverse chronological order. That's the only way it makes sense to stop at the first post that is too old.
Create a Tooter object that toots stuff. This is some of the oldest code in the project, and I'm not a great programmer. So some of this is probably a bit sloppy.
Tooter
¶
Bases: Mastodon
Source code in mastoscore/tooter.py
class Tooter(Mastodon):
    """A thin Mastodon.py subclass that talks to exactly one server.

    The connection is authenticated when it targets our configured home
    server, and anonymous for any other server.
    """

    # NOTE(review): class-level mutable defaults are shared by every
    # Tooter instance — presumably intentional, but confirm before
    # creating more than one Tooter per process.
    credentials: dict = {}
    hostname: str = ""
    files: dict = {}

    def __init__(self, config, phase: str, server: str = ""):
        """Connect to a Mastodon server.

        Args:
            config: Parsed configuration (ConfigParser-style `.get(section, key)`).
            phase: Config section holding the `cred_file` entry for login.
            server: Server base URL. Empty string or the configured
                `api_base_url` means "home server" and triggers an
                authenticated login; any other value connects anonymously.

        Raises:
            RuntimeError: When the credentials file is missing or does not
                contain a usable access token.
        """
        logger = get_logger(config, __name__)
        home_url = config.get("mastoscore", "api_base_url")

        # Anything other than the home server gets an anonymous connection.
        if server not in ("", home_url):
            logger.debug(f"Connecting to {server} anonymously")
            super().__init__(
                user_agent=_USER_AGENT,
                debug_requests=False,
                feature_set="pleroma",
                request_timeout=10,
                api_base_url=server,
            )
            logger.debug(f"Connected to {self.api_base_url} (anonymous)")
            return

        # Home server: log in using the credentials file named in config.
        try:
            self.cred_file = config.get(phase, "cred_file")
            logger.debug(f"Logging in from {self.cred_file}")
            super().__init__(
                access_token=self.cred_file,
                user_agent=_USER_AGENT,
                debug_requests=False,
                feature_set="pleroma",
                request_timeout=10,
            )
            # Cache the identity of the logged-in account on the instance.
            account = self.me()
            self.id = account["id"]
            self.name = account.username
            self.acct = self.name
            logger.debug(
                f"Logged in as {account.username} on {self.api_base_url} (uid {account.id})"
            )
        except FileNotFoundError as err:
            raise RuntimeError(f"Credentials file not found: {self.cred_file}. Please create it with your Mastodon access token.") from err
        except Exception as err:
            # Mastodon.py raises this when the token file is empty/garbage.
            if "API base URL is required" in str(err):
                raise RuntimeError(f"Invalid or missing credentials in {self.cred_file}. The file should contain a valid Mastodon access token.") from err
            raise
def search_hashtag(
self, hashtag: str, oldest_date: datetime.datetime, max: int = 2000
) -> Generator[List[Dict[str, Any]], None, None]:
"""
Given a hashtag, search the public timeline for that hashtag. Yield pages
of toots that are newer than oldest_date.
Args:
hashtag: String of the hashtag to search for
oldest_date: Oldest date that we will accept
max: Maximum number of toots to fetch (default: 2000)
Returns:
Yields pages of toots (up to 40 per page) as they are fetched.
"""
logger = logging.getLogger()
toots_so_far: int = 0
# According to https://docs.joinmastodon.org/methods/timelines/#tag
# max page size, by default, is 40
pagesize: int = min(max, 40) if max < 40 else 40
page: List[Dict[str, Any]] = self.timeline_hashtag(hashtag, limit=pagesize) # type: ignore
if len(page) == 0:
logger.error(f"No toots found for hashtag: {hashtag}")
return
# Keep fetching toots until we get to one that is older than oldest_date
# or we get to the end.
# According to https://docs.joinmastodon.org/api/rate-limits/#per-ip
# the rate limit is 300 calls in 5 minutes. If we have 2000 toots we have
# to fetch in pages of 40, that's 50 API calls + some authentication calls
while len(page) > 0 and toots_so_far < max:
# check to see if we have reached the oldest allowable toot
toots_to_keep = [toot for toot in page if toot["created_at"] > oldest_date]
# Trim to not exceed max
remaining = max - toots_so_far
if len(toots_to_keep) > remaining:
toots_to_keep = toots_to_keep[:remaining]
toots_so_far += len(toots_to_keep)
logger.debug(
f"Added {len(toots_to_keep)} more toots, {toots_so_far} total"
)
# Yield this page
if toots_to_keep:
yield toots_to_keep
# if the check ages function returns fewer than we sent to it
# we have hit the end of the list of toots
if len(toots_to_keep) < len(page):
logger.debug("stopping due to age")
return
logger.debug(f"Requesting {pagesize} more toots")
page = self.fetch_next(page) # type: ignore
# Loop exited - either no more pages or hit max
logger.debug(f"Stopping: fetched {toots_so_far} toots")
search_hashtag(hashtag, oldest_date, max=2000)
¶
Given a hashtag, search the public timeline for that hashtag. Yield pages of toots that are newer than oldest_date.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `hashtag` | `str` | String of the hashtag to search for | *required* |
| `oldest_date` | `datetime` | Oldest date that we will accept | *required* |
| `max` | `int` | Maximum number of toots to fetch (default: 2000) | `2000` |

Returns:

| Type | Description |
|---|---|
| `Generator` | Yields pages of toots (up to 40 per page) as they are fetched. |
Source code in mastoscore/tooter.py
def search_hashtag(
self, hashtag: str, oldest_date: datetime.datetime, max: int = 2000
) -> Generator[List[Dict[str, Any]], None, None]:
"""
Given a hashtag, search the public timeline for that hashtag. Yield pages
of toots that are newer than oldest_date.
Args:
hashtag: String of the hashtag to search for
oldest_date: Oldest date that we will accept
max: Maximum number of toots to fetch (default: 2000)
Returns:
Yields pages of toots (up to 40 per page) as they are fetched.
"""
logger = logging.getLogger()
toots_so_far: int = 0
# According to https://docs.joinmastodon.org/methods/timelines/#tag
# max page size, by default, is 40
pagesize: int = min(max, 40) if max < 40 else 40
page: List[Dict[str, Any]] = self.timeline_hashtag(hashtag, limit=pagesize) # type: ignore
if len(page) == 0:
logger.error(f"No toots found for hashtag: {hashtag}")
return
# Keep fetching toots until we get to one that is older than oldest_date
# or we get to the end.
# According to https://docs.joinmastodon.org/api/rate-limits/#per-ip
# the rate limit is 300 calls in 5 minutes. If we have 2000 toots we have
# to fetch in pages of 40, that's 50 API calls + some authentication calls
while len(page) > 0 and toots_so_far < max:
# check to see if we have reached the oldest allowable toot
toots_to_keep = [toot for toot in page if toot["created_at"] > oldest_date]
# Trim to not exceed max
remaining = max - toots_so_far
if len(toots_to_keep) > remaining:
toots_to_keep = toots_to_keep[:remaining]
toots_so_far += len(toots_to_keep)
logger.debug(
f"Added {len(toots_to_keep)} more toots, {toots_so_far} total"
)
# Yield this page
if toots_to_keep:
yield toots_to_keep
# if the check ages function returns fewer than we sent to it
# we have hit the end of the list of toots
if len(toots_to_keep) < len(page):
logger.debug("stopping due to age")
return
logger.debug(f"Requesting {pagesize} more toots")
page = self.fetch_next(page) # type: ignore
# Loop exited - either no more pages or hit max
logger.debug(f"Stopping: fetched {toots_so_far} toots")