From ca22f97d95c6c851465d97aa9823ed7925774b3a Mon Sep 17 00:00:00 2001
From: Alex Bair
Date: Thu, 5 Dec 2024 09:48:48 -0500
Subject: [PATCH] source-twilio: improve `MessageMedia` incremental sync speed

`MessageMedia` was previously checking every message between the
config's start date and the present for new media, then filtering out
any media created before the cursor value. That made incremental syncs
take an extremely long time without any apparent progress; the stream
could be searching through the past few years of messages when it
usually only needs to search through the past few minutes.

This change makes the `MessageMedia` stream only check messages created
since the most recent cursor value, falling back to the config's start
date if no cursor value is present. This significantly speeds up the
connector during incremental syncs.

This change also increases the date window size used when fetching a
message's media from 1 year to 100 years. This reduces the number of API
requests needed when backfilling media records over a year old; instead
of requesting a single year of media at a time, the connector
essentially requests all of a message's media in one request.

It would make more sense to not use a sliding date window strategy for
fetching a single message's media, but rewriting the `MessageMedia`
stream in a backwards compatible way is a large effort I'd like to
avoid, especially when small, targeted changes address the current
issue.
--- source-twilio/source_twilio/streams.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/source-twilio/source_twilio/streams.py b/source-twilio/source_twilio/streams.py index daee91db13..853f32bdbb 100644 --- a/source-twilio/source_twilio/streams.py +++ b/source-twilio/source_twilio/streams.py @@ -557,17 +557,23 @@ class Messages(IncrementalTwilioStream, TwilioNestedStream): class MessageMedia(IncrementalTwilioStream, TwilioNestedStream): """https://www.twilio.com/docs/sms/api/media-resource#read-multiple-media-resources""" - parent_stream = Messages + parent_stream: type[Messages] = Messages data_field = "media_list" subresource_uri_key = "media" media_exist_validation = {"num_media": "0"} lower_boundary_filter_field = "DateCreated>" upper_boundary_filter_field = "DateCreated<" - cursor_field = "date_created" + cursor_field: str = "date_created" + # Per the Twilio docs on updating Messages, media can't be added to/updated within a message after + # the message is created, so it doesn't make sense to use small date windows to check for a message's media. + # Making slice_step_default (i.e. the window size) really large reduces the number of requests we need + # to get all of a message's media without significantly rewriting how this stream works. + slice_step_default = pendulum.duration(years=100) @cached_property def parent_stream_instance(self): - return self.parent_stream(authenticator=self.authenticator, start_date=self._start_date, lookback_window=self._lookback_window) + most_recent_cursor = self.state.get(self.cursor_field, self._start_date) + return self.parent_stream(authenticator=self.authenticator, start_date=most_recent_cursor, lookback_window=self._lookback_window) class UsageNestedStream(TwilioNestedStream):