-
Notifications
You must be signed in to change notification settings - Fork 0
/
pptx_helper.py
591 lines (503 loc) · 20 KB
/
pptx_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
import logging
import pathlib
import re
import tempfile
from typing import List, Tuple
import json5
import pptx
from global_config import GlobalConfig
# Matches a heading that starts with "slide", one or more spaces, digits and a colon
# (case-insensitive), e.g. "Slide 3:". Used to strip such prefixes from slide headings.
PATTERN = re.compile(r"^slide[ ]+\d+:", re.IGNORECASE)
# Example of the JSON layout read by generate_powerpoint_presentation: a deck "title" and a
# list of "slides", each with a "heading" and "bullet_points" (nested lists mark sub-levels).
SAMPLE_JSON_FOR_PPTX = '''
{
"title": "Understanding AI",
"slides": [
{
"heading": "Introduction",
"bullet_points": [
"Brief overview of AI",
[
"Importance of understanding AI"
]
]
}
]
}
'''
# Configure the root logger at import time; the level comes from GlobalConfig.LOG_LEVEL.
logging.basicConfig(
level=GlobalConfig.LOG_LEVEL,
format='%(asctime)s - %(message)s',
)
def remove_slide_number_from_heading(header: str) -> str:
    """
    Strip a leading "Slide <number>:" label from a slide header.

    Headers that do not start with such a label are returned unchanged. When the label is
    present, everything after its colon is returned exactly as found (whitespace included).

    :param header: The header of a slide
    :return: The header without the slide-number label
    """
    if not PATTERN.match(header):
        return header

    # The matched label ends at the first colon, so the remainder starts right after it.
    _, _, remainder = header.partition(':')
    return remainder
def find_split_point_org(flat_items_list, max_chars_per_slide):
"""Find the best split point around the character threshold, preferably at level=1."""
current_char_count = 0
for i, (text, level) in enumerate(flat_items_list):
current_char_count += len(text)
if current_char_count > max_chars_per_slide:
# Look for a level=1 item around the threshold
for j in range(max(0, i-3), min(len(flat_items_list), i+3)):
if flat_items_list[j][1] == 1:
return j
return i
return len(flat_items_list)
def find_split_point(flat_items_list, max_chars_per_slide):
    """
    Find the index at which a flat bullet list should be cut to respect a character budget.

    Text lengths are accumulated item by item. When the running total exceeds
    `max_chars_per_slide`, the split point is chosen in this order of preference:

    1. the most recent item with level 0, provided the running total at that item is larger
       than a quarter of the budget (so the slide is not too small);
    2. otherwise the most recent item with level 1, under the same size condition;
    3. otherwise the item that caused the overflow.

    For a non-empty list the result is never 0: cutting at index 0 would produce an empty
    slide and leave the caller with the same list to split again.

    :param flat_items_list: A list of (text, level) tuples
    :param max_chars_per_slide: The character budget for one slide
    :return: The split index, or the list length if the budget is never exceeded
    """
    min_chars_per_slide = max_chars_per_slide / 4
    running_chars = 0
    # (index, running total at that index) of the latest candidate per preferred level
    top_level_candidate = None
    nested_candidate = None

    for index, (text, level) in enumerate(flat_items_list):
        running_chars += len(text)

        # Index 0 is never a usable cut, so it is not recorded as a candidate.
        if index > 0:
            if level == 0:
                top_level_candidate = (index, running_chars)
            elif level == 1:
                nested_candidate = (index, running_chars)

        if running_chars <= max_chars_per_slide:
            continue

        logging.debug(
            'Overflow detected for index: %d, text = %s, level = %s', index, text, level
        )
        for candidate in (top_level_candidate, nested_candidate):
            if candidate is not None and candidate[1] > min_chars_per_slide:
                logging.debug('Splitting at index %d', candidate[0])
                return candidate[0]

        # No good level 0 or level 1 split; cut right here, but keep at least one item.
        return max(index, 1)

    return len(flat_items_list)
def split_slide_content(flat_items_list, max_chars_per_slide):
    """
    Split a flat bullet list into per-slide chunks that respect the character budget.

    The list is cut repeatedly at the index returned by `find_split_point` until nothing is
    left. Every chunk holds at least one item, even when that single item is longer than the
    budget, so the loop always terminates.

    :param flat_items_list: A list of (text, level) tuples
    :param max_chars_per_slide: The character budget for one slide
    :return: A list of chunks, each a list of (text, level) tuples; empty for empty input
    """
    slides_content = []
    remaining = flat_items_list

    while remaining:
        # A split point of 0 would append an empty chunk and never shrink the list.
        split_point = max(1, find_split_point(remaining, max_chars_per_slide))
        slides_content.append(remaining[:split_point])
        remaining = remaining[split_point:]

    return slides_content
def cleanup_slides_data(data_string):
    """
    Normalise the "slides" entry of a JSON5 presentation description.

    The cleaned-up "slides" value is a flat list of slide objects: string elements are
    dropped, objects are kept, and nested lists are replaced by the objects they contain.
    A "slides" value that is itself a JSON string is parsed first.

    This is a best-effort step: whenever the input cannot be parsed, is not an object, has
    no "slides" entry, or its "slides" entry is not a list, the input string is returned
    unchanged so the caller can report the problem.

    :param data_string: The presentation contents as a JSON5 string
    :return: The re-serialised JSON string with cleaned slides, or `data_string` unchanged
    """
    try:
        data_dict = json5.loads(data_string)
    except Exception as e:  # best effort: leave error reporting to the caller's own parse
        logging.error(
            '[cleanup_slides_data] Error parsing JSON data: %s with data = %s', e, data_string
        )
        return data_string

    # A top-level scalar or list has no "slides" entry to clean up.
    if not isinstance(data_dict, dict) or 'slides' not in data_dict:
        return data_string

    slides = data_dict['slides']
    if isinstance(slides, str):
        try:
            logging.debug('Parsing JSON string for slides')
            slides = json5.loads(slides)
        except Exception as e:
            logging.error(
                '[cleanup_slides_data] Error parsing JSON data: %s with data = %s', e, slides
            )
            return data_string

    if not isinstance(slides, list):
        # Objects, numbers, null, etc. cannot be flattened into a list of slides.
        logging.debug('slides is not a list but %s; leaving data unchanged', type(slides))
        return data_string

    processed_slides = []
    for element in slides:
        if isinstance(element, dict):
            processed_slides.append(element)
        elif isinstance(element, list):
            logging.debug('Breaking up a list: %s', element)
            processed_slides.extend(item for item in element if isinstance(item, dict))
        # Strings and any other element types are dropped.

    data_dict['slides'] = processed_slides
    return json5.dumps(data_dict, indent=4)
def generate_powerpoint_presentation(structured_data: str, slides_template: str, output_file_path: pathlib.Path, max_chars_per_slide: int = 1000) -> List:
    """
    Create and save a PowerPoint presentation file containing the content in JSON format.

    The slides shipped with the template are removed, a title slide is added, and each entry
    of "slides" becomes one or more content slides: bullet lists that exceed
    `max_chars_per_slide` are continued on follow-up slides titled "... (continued)".
    URLs starting with http:// or https:// inside bullet text become clickable hyperlinks.
    Missing "title", "heading" or "bullet_points" entries are treated as empty.

    :param structured_data: The presentation contents as "JSON" (may contain trailing commas)
    :param slides_template: The PPTX template to use
    :param output_file_path: The path of the PPTX file to save as
    :param max_chars_per_slide: Maximum number of characters allowed per slide before splitting
    :return: A list of presentation title and slides headers; an empty list (and no file
        saved) if the data has no "slides" entry
    :raises ValueError: If `structured_data` cannot be parsed as JSON5
    """
    BAD_STRING = "This is an example response from ChatGPT.,"
    structured_data = structured_data.replace(BAD_STRING, "")
    structured_data = cleanup_slides_data(structured_data)

    # The structured "JSON" might contain trailing commas, so using json5
    try:
        parsed_data = json5.loads(structured_data)
    except Exception as e:
        ee = f"Error parsing JSON5 data: {e} with data = {structured_data}"
        logging.error(ee)
        raise ValueError(ee) from e

    config = GlobalConfig()
    template_file = config.PPTX_TEMPLATE_FILES[slides_template]['file']
    logging.debug("*** Using PPTX template: %s", template_file)
    presentation = pptx.Presentation(template_file)

    # Remove the slides that come with the template, last to first so indexes stay valid
    for i in range(len(presentation.slides) - 1, -1, -1):
        rId = presentation.slides._sldIdLst[i].rId
        presentation.part.drop_rel(rId)
        del presentation.slides._sldIdLst[i]

    # The title slide
    title_slide_layout = presentation.slide_layouts[0]
    slide = presentation.slides.add_slide(title_slide_layout)
    title = slide.shapes.title
    subtitle = slide.placeholders[1]
    title.text = parsed_data.get('title', '')
    logging.debug('Presentation title is: %s', title.text)
    if 'subtitle' in parsed_data:
        subtitle.text = parsed_data['subtitle']
        logging.debug('Presentation subtitle is: %s', subtitle.text)
    else:
        subtitle.text = ''

    all_headers = [title.text, ]
    slides_parsed_data = parsed_data.get('slides', None)
    if slides_parsed_data is None:
        error_message = 'No slides found in the parsed data'
        logging.error(error_message)
        return []

    # Expand every input slide into one or more output slides that fit the budget
    new_slides_data = []
    for a_slide in slides_parsed_data:
        heading = a_slide.get('heading', '')
        flat_items_list = get_flat_list_of_contents(a_slide.get('bullet_points') or [], level=0)
        # A slide without bullets (e.g. a section header) still gets one slide of its own
        slides_content = split_slide_content(flat_items_list, max_chars_per_slide) or [[]]
        for i, slide_content in enumerate(slides_content):
            if i == 0:
                slide_heading = heading
            elif i == 1:
                slide_heading = f"{heading} (continued)"
            else:
                slide_heading = f"{heading} (continued {i})"
            slide_data = {
                'heading': slide_heading,
                'bullet_points': slide_content
            }
            if 'type' in a_slide:
                slide_data["type"] = a_slide['type']
            new_slides_data.append(slide_data)

    for slide_data in new_slides_data:
        layout_index = 2 if slide_data.get("type") == "sectionheader" else 1
        slide = presentation.slides.add_slide(presentation.slide_layouts[layout_index])
        shapes = slide.shapes
        title_shape = shapes.title
        body_shape = shapes.placeholders[1]
        title_shape.text = remove_slide_number_from_heading(slide_data['heading'])
        all_headers.append(title_shape.text)
        text_frame = body_shape.text_frame
        for text, level in slide_data['bullet_points']:
            paragraph = text_frame.add_paragraph()
            _add_text_with_hyperlinks(paragraph, text)
            paragraph.level = level

    presentation.save(output_file_path)
    return all_headers


def _add_text_with_hyperlinks(paragraph, text: str) -> None:
    """
    Append `text` to a paragraph as runs, turning http(s) URLs into hyperlink runs.

    A URL starts at "http://" or "https://" and extends to the next space or to the end of
    the text. All other text, including the spaces around URLs, is added as plain runs.

    :param paragraph: The python-pptx paragraph to add runs to
    :param text: The bullet text
    """
    while text:
        url_positions = [
            position
            for position in (text.find('https://'), text.find('http://'))
            if position != -1
        ]
        if not url_positions:
            paragraph.add_run().text = text
            break

        # Handle whichever URL comes first, regardless of its scheme
        url_start = min(url_positions)
        if url_start > 0:
            paragraph.add_run().text = text[:url_start]
            text = text[url_start:]

        # The URL now starts at index 0; it ends at the next space, which stays in the text
        url_end = text.find(' ')
        if url_end == -1:
            url, text = text, ''
        else:
            url, text = text[:url_end], text[url_end:]

        run = paragraph.add_run()
        run.text = url
        run.hyperlink.address = url
def generate_powerpoint_presentation_advanced(
        structured_data: str,
        slides_template: str,
        output_file_path: pathlib.Path
) -> List:
    """
    Create and save a PowerPoint presentation file containing the content in JSON format.

    In this variant "bullet_points" is a flat list of objects with optional "bullet_type"
    ("bullet", "number" or "letter"), "bullet_level" and "bullet_text" entries. The bullet,
    number and letter markers are written into the paragraph text by hand. The "slides"
    entry may be either a list or a JSON string that encodes one. Slides and bullet points
    that are not objects are skipped.

    :param structured_data: The presentation contents as "JSON" (may contain trailing commas)
    :param slides_template: The PPTX template to use
    :param output_file_path: The path of the PPTX file to save as
    :return: A list of presentation title and slides headers; an empty list (and no file
        saved) if `structured_data` cannot be parsed
    """
    # python-pptx accepts paragraph levels 0 to 8 only
    MAX_LEVEL = 8
    lettered = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    # The structured "JSON" might contain trailing commas, so using json5
    try:
        parsed_data = json5.loads(structured_data)
    except Exception as e:
        logging.error("Error parsing JSON5 data: %s", e)
        logging.error("structured_data = %s", structured_data)
        return []

    logging.debug("parsed_data = %s", parsed_data)

    if 'slides' in parsed_data:
        data_slides = parsed_data['slides']
    else:
        data_slides = []
        logging.error("[ERROR] No slides in parsed_data: %s", parsed_data)
    data_title = parsed_data.get('title', "")
    data_subtitle = parsed_data.get('subtitle', "")
    logging.debug("data_title = %s, data_subtitle = %s", data_title, data_subtitle)

    # Only a string needs decoding; an already-parsed list is used as it is
    slides = json5.loads(data_slides) if isinstance(data_slides, str) else data_slides

    config = GlobalConfig()
    template_file = config.PPTX_TEMPLATE_FILES[slides_template]['file']
    logging.debug("*** Using PPTX template: %s", template_file)
    presentation = pptx.Presentation(template_file)

    # Remove the slides that come with the template, last to first so indexes stay valid
    for i in range(len(presentation.slides) - 1, -1, -1):
        rId = presentation.slides._sldIdLst[i].rId
        presentation.part.drop_rel(rId)
        del presentation.slides._sldIdLst[i]

    # The title slide
    title_slide_layout = presentation.slide_layouts[0]
    pptx_slide = presentation.slides.add_slide(title_slide_layout)
    pptx_title = pptx_slide.shapes.title
    pptx_title.text = data_title
    pptx_subtitle = pptx_slide.placeholders[1]
    pptx_subtitle.text = data_subtitle
    all_headers = [pptx_title.text, ]

    # Add contents in a loop
    for a_slide in slides:
        if not isinstance(a_slide, dict):
            logging.warning("Skipping slide that is not an object: %s", a_slide)
            continue

        heading = a_slide.get('heading', "")
        bullet_points = a_slide.get('bullet_points', [])
        layout_index = 2 if a_slide.get('type', "") == "sectionheader" else 1

        current_slide = presentation.slides.add_slide(presentation.slide_layouts[layout_index])
        current_shapes = current_slide.shapes
        title_shape = current_shapes.title
        body_shape = current_shapes.placeholders[1]
        title_shape.text = remove_slide_number_from_heading(heading)
        all_headers.append(title_shape.text)
        text_frame = body_shape.text_frame

        if not bullet_points:
            # create a blank bullet point
            paragraph = text_frame.add_paragraph()
            paragraph.text = ""
            paragraph.level = 0
            continue

        # The bullet_points is a flat array with indentation level explicitly set
        # We create bullets, numbers and letters 'bullets' manually
        numbered_indexes = {}  # level -> how many numbered items were emitted at that level
        lettered_indexes = {}  # level -> how many lettered items were emitted at that level
        previous_numbered_level = -1
        previous_lettered_level = -1

        for an_item in bullet_points:
            if not isinstance(an_item, dict):
                logging.warning("Skipping bullet point that is not an object: %s", an_item)
                continue

            bullet_type = an_item.get("bullet_type", "none")
            bullet_level = min(max(int(an_item.get("bullet_level", 0)), 0), MAX_LEVEL)
            bullet_text = an_item.get("bullet_text", "")

            if bullet_type == "bullet":
                bullet_text = f"• {bullet_text}"
            elif bullet_type == "number":
                # Entering a deeper level than the previous numbered item restarts counting
                if previous_numbered_level < bullet_level:
                    numbered_indexes[bullet_level] = 0
                count = numbered_indexes.get(bullet_level, 0)
                bullet_text = f"{count + 1}. {bullet_text}"
                numbered_indexes[bullet_level] = count + 1
                previous_numbered_level = bullet_level
            elif bullet_type == "letter":
                if previous_lettered_level < bullet_level:
                    lettered_indexes[bullet_level] = 0
                count = lettered_indexes.get(bullet_level, 0)
                # Wrap around after "Z" instead of running off the end of the alphabet
                bullet_text = f"{lettered[count % len(lettered)]}. {bullet_text}"
                lettered_indexes[bullet_level] = count + 1
                previous_lettered_level = bullet_level

            logging.debug(
                "bullet_type = %s, bullet_level = %s, bullet_text = %s",
                bullet_type, bullet_level, bullet_text
            )
            paragraph = text_frame.add_paragraph()
            paragraph.text = bullet_text
            paragraph.level = bullet_level

    presentation.save(output_file_path)
    return all_headers
def get_flat_list_of_contents(items: list, level: int) -> List[Tuple]:
    """
    Turn nested bullet points into one flat list of (text, level) pairs.

    Strings are emitted at the current level. A nested list is expanded in place, one level
    deeper. Entries of any other type are ignored.

    :param items: The bullet points; each entry is a string or a (nested) list
    :param level: The hierarchy level assigned to the strings directly inside `items`
    :return: A list of (bullet item text, hierarchical level) tuples, in document order
    """
    collected = []

    for entry in items:
        if isinstance(entry, str):
            collected.append((entry, level))
            continue
        if isinstance(entry, list):
            # Children of a nested list sit one level below the current one
            collected.extend(get_flat_list_of_contents(entry, level + 1))

    return collected
if __name__ == '__main__':
    # Manual smoke test: render a sample deck into a temporary .pptx file.
    #
    # bullets = [
    #     'Description',
    #     'Types',
    #     [
    #         'Type A',
    #         'Type B'
    #     ],
    #     'Grand parent',
    #     [
    #         'Parent',
    #         [
    #             'Grand child'
    #         ]
    #     ]
    # ]
    # output = get_flat_list_of_contents(bullets, level=0)
    # for x in output:
    #     print(x)
    json_data = '''
{
"title": "Understanding AI",
"slides": [
{
"heading": "Introduction",
"bullet_points": [
"Brief overview of AI",
[
"Importance of understanding AI"
]
]
},
{
"heading": "What is AI?",
"bullet_points": [
"Definition of AI",
[
"Types of AI",
[
"Narrow or weak AI",
"General or strong AI"
]
],
"Differences between AI and machine learning"
]
},
{
"heading": "How AI Works",
"bullet_points": [
"Overview of AI algorithms",
[
"Types of AI algorithms",
[
"Rule-based systems",
"Decision tree systems",
"Neural networks"
]
],
"How AI processes data"
]
},
{
"heading": "Pros of AI",
"bullet_points": [
"Increased efficiency and productivity",
"Improved accuracy and precision",
"Enhanced decision-making capabilities",
"Personalized experiences"
]
},
{
"heading": "Cons of AI",
"bullet_points": [
"Job displacement and loss of employment",
"Bias and discrimination",
"Privacy and security concerns",
"Dependence on technology"
]
},
{
"heading": "Future Prospects of AI",
"bullet_points": [
"Advancements in fields such as healthcare and finance",
"Increased use"
]
}
]
}'''

    # Only the file name is needed, so the handle is closed right away; the presentation
    # is written to that path afterwards (delete=False keeps the file on disk).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pptx') as temp:
        path = pathlib.Path(temp.name)

    # The generator expects the JSON text itself (it parses the string internally),
    # so pass the raw string rather than an already-parsed dict.
    generate_powerpoint_presentation(
        json_data,
        output_file_path=path,
        slides_template='Blank'
    )