Module: tooter

This module is a thin wrapper around the Mastodon.py library, used to call the Mastodon API and fetch toots.

The idea is to create an object, a "Tooter," that encapsulates our conversation with one server. It basically takes a date, a hashtag, and a max. It starts "now" and goes backward in time until it gets a post that is too old or until it hits the max. It does assume that posts are returned in reverse chronological order. That's the only way it makes sense to stop at the first post that is too old.

Create a Tooter object that toots stuff. This is some of the oldest code in the project, and I'm not a great programmer. So some of this is probably a bit sloppy.

Tooter

Bases: Mastodon

Source code in mastoscore/tooter.py
class Tooter(Mastodon):
    """Encapsulate a conversation with one Mastodon server.

    Authenticates against the configured "home" server, or connects
    anonymously to any other server, and provides hashtag-timeline
    searching with age and count limits.
    """

    # NOTE(review): these are class-level mutable attributes, shared by every
    # Tooter instance. If per-instance state is intended they should be
    # assigned in __init__ — kept as-is to avoid changing observable behavior.
    credentials: dict = {}
    hostname: str = ""
    files: dict = {}

    def __init__(self, config, phase: str, server: str = ""):
        """Connect to a Mastodon server, authenticated or anonymous.

        Args:
            config: Parsed configuration (ConfigParser-style object with
                ``get(section, key)``).
            phase: Config section name that holds the ``cred_file`` entry.
            server: Server to talk to. Empty string, or a value equal to the
                configured ``api_base_url``, triggers authenticated login;
                any other value connects anonymously.

        Raises:
            RuntimeError: If the credentials file is missing, or does not
                contain a usable access token.
        """
        logger = get_logger(config, __name__)

        # Authenticate only when talking to our "home" server (the configured
        # api_base_url); every other server is queried anonymously.
        api_base_url = config.get("mastoscore", "api_base_url")
        should_authenticate = server == "" or server == api_base_url

        if should_authenticate:
            try:
                self.cred_file = config.get(phase, "cred_file")
                logger.debug(f"Logging in from {self.cred_file}")
                super().__init__(
                    access_token=self.cred_file,
                    user_agent=_USER_AGENT,
                    debug_requests=False,
                    feature_set="pleroma",
                    request_timeout=10,
                )
                # Resolve our own account so callers can use id/name/acct.
                account = self.me()
                self.id = account["id"]
                self.name = account.username
                self.acct = self.name
                logger.debug(
                    f"Logged in as {account.username} on {self.api_base_url} (uid {account.id})"
                )
            except FileNotFoundError as e:
                raise RuntimeError(f"Credentials file not found: {self.cred_file}. Please create it with your Mastodon access token.") from e
            except Exception as e:
                # Mastodon.py raises a generic error when the token file lacks
                # a usable base URL/token; translate it into a clearer message.
                if "API base URL is required" in str(e):
                    raise RuntimeError(f"Invalid or missing credentials in {self.cred_file}. The file should contain a valid Mastodon access token.") from e
                raise
        else:
            logger.debug(f"Connecting to {server} anonymously")
            super().__init__(
                user_agent=_USER_AGENT,
                debug_requests=False,
                feature_set="pleroma",
                request_timeout=10,
                api_base_url=server,
            )
            logger.debug(f"Connected to {self.api_base_url} (anonymous)")

    def search_hashtag(
        self, hashtag: str, oldest_date: datetime.datetime, max: int = 2000
    ) -> Generator[List[Dict[str, Any]], None, None]:
        """
        Given a hashtag, search the public timeline for that hashtag. Yield pages
        of toots that are newer than oldest_date.

        Assumes the server returns toots in reverse chronological order, so
        the first toot older than oldest_date marks the stopping point.

        Args:
            hashtag: String of the hashtag to search for
            oldest_date: Oldest date that we will accept
            max: Maximum number of toots to fetch (default: 2000)

        Returns:
            Yields pages of toots (up to 40 per page) as they are fetched.
        """
        # Use the module logger rather than the root logger, consistent with
        # the logging style used in __init__.
        logger = logging.getLogger(__name__)

        toots_so_far: int = 0
        # According to https://docs.joinmastodon.org/methods/timelines/#tag
        # the max page size is 40; never ask for more than we will keep.
        # (min() alone covers both the "max < 40" and "max >= 40" cases.)
        pagesize: int = min(max, 40)

        page: List[Dict[str, Any]] = self.timeline_hashtag(hashtag, limit=pagesize)  # type: ignore

        if not page:
            logger.error(f"No toots found for hashtag: {hashtag}")
            return

        # Keep fetching toots until we get to one that is older than oldest_date
        # or we get to the end.
        # According to https://docs.joinmastodon.org/api/rate-limits/#per-ip
        # the rate limit is 300 calls in 5 minutes. If we have 2000 toots we have
        # to fetch in pages of 40, that's 50 API calls + some authentication calls
        while page and toots_so_far < max:
            # Drop any toot that has aged out.
            toots_to_keep = [toot for toot in page if toot["created_at"] > oldest_date]
            # Trim to not exceed max
            remaining = max - toots_so_far
            if len(toots_to_keep) > remaining:
                toots_to_keep = toots_to_keep[:remaining]
            toots_so_far += len(toots_to_keep)

            logger.debug(
                f"Added {len(toots_to_keep)} more toots, {toots_so_far} total"
            )

            # Yield this page
            if toots_to_keep:
                yield toots_to_keep

            # If we kept fewer toots than the page held, we crossed the age
            # boundary (or the max) and everything further back is older: stop.
            if len(toots_to_keep) < len(page):
                logger.debug("stopping due to age")
                return

            logger.debug(f"Requesting {pagesize} more toots")
            # fetch_next() returns None when there is no next page; normalize
            # to an empty list so len()/truthiness checks terminate the loop
            # instead of raising TypeError.
            page = self.fetch_next(page) or []  # type: ignore

        # Loop exited - either no more pages or hit max
        logger.debug(f"Stopping: fetched {toots_so_far} toots")

search_hashtag(hashtag, oldest_date, max=2000)

Given a hashtag, search the public timeline for that hashtag. Yield pages of toots that are newer than oldest_date.

Parameters:

Name Type Description Default
hashtag str

String of the hashtag to search for

required
oldest_date datetime

Oldest date that we will accept

required
max int

Maximum number of toots to fetch (default: 2000)

2000

Returns:

Type Description
Generator[List[Dict[str, Any]], None, None]

Yields pages of toots (up to 40 per page) as they are fetched.

Source code in mastoscore/tooter.py
def search_hashtag(
    self, hashtag: str, oldest_date: datetime.datetime, max: int = 2000
) -> Generator[List[Dict[str, Any]], None, None]:
    """
    Given a hashtag, search the public timeline for that hashtag. Yield pages
    of toots that are newer than oldest_date.

    Assumes the server returns toots in reverse chronological order, so the
    first toot older than oldest_date marks the stopping point.

    Args:
        hashtag: String of the hashtag to search for
        oldest_date: Oldest date that we will accept
        max: Maximum number of toots to fetch (default: 2000)

    Returns:
        Yields pages of toots (up to 40 per page) as they are fetched.
    """
    # Use the module logger rather than the root logger.
    logger = logging.getLogger(__name__)

    toots_so_far: int = 0
    # According to https://docs.joinmastodon.org/methods/timelines/#tag
    # the max page size is 40; never ask for more than we will keep.
    # (min() alone covers both the "max < 40" and "max >= 40" cases.)
    pagesize: int = min(max, 40)

    page: List[Dict[str, Any]] = self.timeline_hashtag(hashtag, limit=pagesize)  # type: ignore

    if not page:
        logger.error(f"No toots found for hashtag: {hashtag}")
        return

    # Keep fetching toots until we get to one that is older than oldest_date
    # or we get to the end.
    # According to https://docs.joinmastodon.org/api/rate-limits/#per-ip
    # the rate limit is 300 calls in 5 minutes. If we have 2000 toots we have
    # to fetch in pages of 40, that's 50 API calls + some authentication calls
    while page and toots_so_far < max:
        # Drop any toot that has aged out.
        toots_to_keep = [toot for toot in page if toot["created_at"] > oldest_date]
        # Trim to not exceed max
        remaining = max - toots_so_far
        if len(toots_to_keep) > remaining:
            toots_to_keep = toots_to_keep[:remaining]
        toots_so_far += len(toots_to_keep)

        logger.debug(
            f"Added {len(toots_to_keep)} more toots, {toots_so_far} total"
        )

        # Yield this page
        if toots_to_keep:
            yield toots_to_keep

        # If we kept fewer toots than the page held, we crossed the age
        # boundary (or the max) and everything further back is older: stop.
        if len(toots_to_keep) < len(page):
            logger.debug("stopping due to age")
            return

        logger.debug(f"Requesting {pagesize} more toots")
        # fetch_next() returns None when there is no next page; normalize
        # to an empty list so len()/truthiness checks terminate the loop
        # instead of raising TypeError.
        page = self.fetch_next(page) or []  # type: ignore

    # Loop exited - either no more pages or hit max
    logger.debug(f"Stopping: fetched {toots_so_far} toots")