From abe7caf18b8f42f4846bd04ee14c0f36543246d5 Mon Sep 17 00:00:00 2001 From: Sirvan Parasteh Date: Thu, 6 Dec 2018 00:58:39 +0330 Subject: [PATCH 1/6] Bug fix: MAXSIZE defined in the config did not work because there was no comparison between the media size and max_size. I just made a minor change and hope it helps. --- .idea/vcs.xml | 6 ++++++ telegram_export/downloader.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/telegram_export/downloader.py b/telegram_export/downloader.py index bf41cc3..7a53c58 100755 --- a/telegram_export/downloader.py +++ b/telegram_export/downloader.py @@ -80,7 +80,8 @@ def _check_media(self, media): """ Checks whether the given MessageMedia should be downloaded or not. """ - if not media or not self.max_size: + # It is needed to chek the size with the max_size defined in config file + if not media or media.document.size > self.max_size: return False if not self.types: return True From b40ec53f4d78512ade814c4d527cba743a291346 Mon Sep 17 00:00:00 2001 From: Sirvan Parasteh Date: Thu, 6 Dec 2018 01:07:34 +0330 Subject: [PATCH 2/6] Add .idea to the list --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 470f867..dedeaa3 100644 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,9 @@ target/ # pyenv .python-version +#pycharm +.idea + # celery beat schedule file celerybeat-schedule From 716c24885049cd6334c48c2eba0bae579f7821cf Mon Sep 17 00:00:00 2001 From: Sirvan Parasteh Date: Thu, 6 Dec 2018 01:09:30 +0330 Subject: [PATCH 3/6] remove .idea from the remote repo --- .idea/vcs.xml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null 
@@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 874af0602f451e74795b5de85a408f1a246ac898 Mon Sep 17 00:00:00 2001 From: Sirvan Parasteh Date: Fri, 7 Dec 2018 00:14:59 +0330 Subject: [PATCH 4/6] Two corrections: first, the file_size check now works, it gets compared with the maxsize defined by the user; second, in some cases there was a minor bug with the filename, it threw an error when file names contained restricted characters like ':', which I fixed --- telegram_export/downloader.py | 36 ++++++++++++++++++++++++++--------- telegram_export/utils.py | 18 ++++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/telegram_export/downloader.py b/telegram_export/downloader.py index 7a53c58..3282b0f 100755 --- a/telegram_export/downloader.py +++ b/telegram_export/downloader.py @@ -16,14 +16,12 @@ __log__ = logging.getLogger(__name__) - VALID_TYPES = { 'photo', 'document', 'video', 'audio', 'sticker', 'voice', 'chatphoto' } BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} " \ "[{elapsed}<{remaining}, {rate_noinv_fmt}{postfix}]" - QUEUE_TIMEOUT = 5 DOWNLOAD_PART_SIZE = 256 * 1024 @@ -40,6 +38,7 @@ class Downloader: Download dialogs and their associated data, and dump them. Make Telegram API requests and sleep for the appropriate time. """ + def __init__(self, client, config, dumper, loop): self.client = client self.loop = loop or asyncio.get_event_loop() @@ -81,11 +80,19 @@ def _check_media(self, media): Checks whether the given MessageMedia should be downloaded or not. 
""" # It is needed to chek the size with the max_size defined in config file - if not media or media.document.size > self.max_size: + + if not media or not self.max_size: return False + if not self.types: return True - return export_utils.get_media_type(media) in self.types + _, size = export_utils.get_file_location(media) + if export_utils.get_media_type(media) in self.types: + if size and size > self.max_size: + return False + else: + return True + return False def _dump_full_entity(self, entity): """ @@ -222,6 +229,12 @@ async def _download_media(self, media_id, context_id, sender_id, date, 'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size ' 'FROM Media WHERE ID = ?', (media_id,) ).fetchone() + # Check the file with self.max_size to make sure user defined limit will be applied + file_size = media_row[6] + if file_size is None: + return + if file_size > self.max_size: + return # Documents have attributes and they're saved under the "document" # namespace so we need to split it before actually comparing. media_type = media_row[3].split('.') @@ -254,7 +267,11 @@ async def _download_media(self, media_id, context_id, sender_id, date, # Detect a sensible extension from the known mimetype. 
if not ext: ext = export_utils.get_extension(media_row[4]) - + """ + just to get sure there is no restricted character in the filename such as :<> + """ + if isinstance(filename, str): + filename = export_utils.format_filename(filename) # Apply the date to the user format string and then replace the map formatter['filename'] = filename filename = date.strftime(self.media_fmt).format_map(formatter) @@ -296,6 +313,7 @@ def progress(saved, total): bar.total += media_row[6] self._incomplete_download = filename + await self.client.download_file( location, file=filename, file_size=media_row[6], part_size_kb=DOWNLOAD_PART_SIZE // 1024, @@ -442,9 +460,9 @@ async def start(self, target_id): ) can_get_participants = ( - isinstance(target_in, types.InputPeerChat) - or (isinstance(target, types.Channel) - and (target.megagroup or target.admin_rights is not None)) + isinstance(target_in, types.InputPeerChat) + or (isinstance(target, types.Channel) + and (target.megagroup or target.admin_rights is not None)) ) if can_get_participants: try: @@ -516,7 +534,7 @@ async def start(self, target_id): # the highest ID ("closest" bound we need to reach), stop. 
if count < req.limit or req.offset_id <= stop_at: __log__.debug('Received less messages than limit, done.') - max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL + max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL self.dumper.save_resume(target_id, stop_at=max_id) break diff --git a/telegram_export/utils.py b/telegram_export/utils.py index 851c888..05c030c 100644 --- a/telegram_export/utils.py +++ b/telegram_export/utils.py @@ -1,5 +1,6 @@ """Utility functions for telegram-export which aren't specific to one purpose""" import mimetypes +import string from telethon.tl import types from urllib.parse import urlparse @@ -259,3 +260,20 @@ def parse_proxy_str(proxy_str): else: proxy = (proxy_type, host, port) return proxy + + +def format_filename(s): + """Take a string and return a valid filename constructed from the string. +Uses a whitelist approach: any characters not present in valid_chars are +removed. Also spaces are replaced with underscores. + +Note: this method may produce invalid filenames such as ``, `.` or `..` +When I use this method I prepend a date string like '2009_01_15_19_46_32_' +and append a file extension like '.txt', so I avoid the potential of using +an invalid filename. + +""" + valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in s if c in valid_chars) + filename = filename.replace(' ', '_') # I don't like spaces in filenames. 
+ return filename \ No newline at end of file From f2869c37afe65ef9ac45ef34aae2a352d035db18 Mon Sep 17 00:00:00 2001 From: Sirvan Parasteh Date: Fri, 7 Dec 2018 00:22:36 +0330 Subject: [PATCH 5/6] undo changes in gitignore file --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index dedeaa3..470f867 100644 --- a/.gitignore +++ b/.gitignore @@ -85,9 +85,6 @@ target/ # pyenv .python-version -#pycharm -.idea - # celery beat schedule file celerybeat-schedule From 0b0e3c75c1906c1460b93ac17da48adddbffe907 Mon Sep 17 00:00:00 2001 From: Lonami Date: Fri, 7 Dec 2018 10:21:51 +0100 Subject: [PATCH 6/6] Better formatting --- telegram_export/downloader.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/telegram_export/downloader.py b/telegram_export/downloader.py index 3282b0f..b3ca70f 100755 --- a/telegram_export/downloader.py +++ b/telegram_export/downloader.py @@ -79,19 +79,19 @@ def _check_media(self, media): """ Checks whether the given MessageMedia should be downloaded or not. 
""" - # It is needed to chek the size with the max_size defined in config file - if not media or not self.max_size: return False if not self.types: return True + _, size = export_utils.get_file_location(media) if export_utils.get_media_type(media) in self.types: if size and size > self.max_size: return False else: return True + return False def _dump_full_entity(self, entity): @@ -229,12 +229,11 @@ async def _download_media(self, media_id, context_id, sender_id, date, 'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size ' 'FROM Media WHERE ID = ?', (media_id,) ).fetchone() - # Check the file with self.max_size to make sure user defined limit will be applied + file_size = media_row[6] - if file_size is None: - return - if file_size > self.max_size: + if file_size is None or file_size > self.max_size: return + # Documents have attributes and they're saved under the "document" # namespace so we need to split it before actually comparing. media_type = media_row[3].split('.') @@ -267,11 +266,10 @@ async def _download_media(self, media_id, context_id, sender_id, date, # Detect a sensible extension from the known mimetype. if not ext: ext = export_utils.get_extension(media_row[4]) - """ - just to get sure there is no restricted character in the filename such as :<> - """ + if isinstance(filename, str): filename = export_utils.format_filename(filename) + # Apply the date to the user format string and then replace the map formatter['filename'] = filename filename = date.strftime(self.media_fmt).format_map(formatter)