-
Notifications
You must be signed in to change notification settings - Fork 0
/
copy_combined.py
94 lines (81 loc) · 3.71 KB
/
copy_combined.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import shutil
from math import ceil
# Report the token count and character count of a UTF-8 text file
def count_tokens_and_characters(file_path):
    """Return ``(tokens, characters)`` for the file at ``file_path``.

    The file is read in full as UTF-8 text. ``tokens`` is the number of
    whitespace-separated pieces produced by ``str.split()``; ``characters``
    is the length of the decoded text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return len(contents.split()), len(contents)
# Function to count special characters in a block
def count_special_characters(block, special_characters_set=None):
    """Count the characters of ``block`` that are considered special.

    Args:
        block: Iterable of characters (typically a string) to scan.
        special_characters_set: Container of the characters that count as
            special. When omitted, ``string.punctuation`` is used.

    Returns:
        The number of items in ``block`` found in ``special_characters_set``.
    """
    if special_characters_set is None:
        # Local import keeps this function self-contained; the module does
        # not define a special-character collection of its own.
        import string
        special_characters_set = frozenset(string.punctuation)
    return sum(1 for char in block if char in special_characters_set)
# Strip a leading run of characters from a file, rewriting it in place
def delete_characters(file_path, num_characters):
    """Remove the first ``num_characters`` characters of a UTF-8 text file.

    The whole file is read, everything from index ``num_characters`` onward
    is written back from the start of the file, and the file is truncated
    at the end of the rewritten text. ``num_characters`` is applied as a
    slice start, so it follows Python slicing rules.
    """
    with open(file_path, mode='r+', encoding='utf-8') as handle:
        remainder = handle.read()[num_characters:]
        handle.seek(0)
        handle.write(remainder)
        # The rewritten text is never longer than the original, so cut off
        # whatever is left of the old contents past the new end.
        handle.truncate()
# Function to create a directory based on the parent directory's name
def create_directory_based_on_parent(input_directory):
    """Create ``combined_<name>`` inside ``input_directory`` and return its path.

    ``<name>`` is the final path component of ``input_directory``. The
    directory is created if missing; an existing one is reused.

    Args:
        input_directory: Directory in which the new directory is created.
            May be relative (including ``"."`` or ``".."``).

    Returns:
        The path of the new directory, built by joining ``input_directory``
        (as given) with the ``combined_<name>`` component.
    """
    # abspath rather than normpath: normpath(".") stays ".", which would give
    # the directory the meaningless name "combined_.".
    parent_directory_name = os.path.basename(os.path.abspath(input_directory))
    new_directory = os.path.join(input_directory, f"combined_{parent_directory_name}")
    os.makedirs(new_directory, exist_ok=True)
    return new_directory
# Function to copy chunks of characters to separate files
def copy_chunks_to_files(original_file_path, output_directory, chunk_size=7000):
    """Split a UTF-8 text file into consecutive fixed-size chunk files.

    The file is read in full and sliced into pieces of ``chunk_size``
    characters (the last piece may be shorter). Piece ``i`` (1-based) is
    written to ``document_chunked<i>.txt`` in ``output_directory``, which is
    created if missing. An empty source file produces no chunk files. A
    summary line with the number of chunks is printed.

    Args:
        original_file_path: Path of the text file to split.
        output_directory: Directory receiving the chunk files.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not greater than zero.
    """
    # Reject sizes that would otherwise divide by zero or silently yield
    # a negative "chunk count" with nothing written.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    os.makedirs(output_directory, exist_ok=True)
    with open(original_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    num_chunks = ceil(len(text) / chunk_size)
    for i in range(num_chunks):
        start_index = i * chunk_size
        # Slicing clamps at the end of the text, so the final chunk is
        # simply whatever remains.
        chunk = text[start_index:start_index + chunk_size]
        chunk_file_path = os.path.join(output_directory, f"document_chunked{i+1}.txt")
        with open(chunk_file_path, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk)
    print(f"{num_chunks} chunks created successfully.")
# Combine every UTF-8-readable file under the current working directory into
# one document (a listing of paths followed by each file's contents), move it
# into the "combined_<cwd name>" directory and split it into chunk files.
working_directory = os.getcwd()

# Create a temporary directory to hold the combined document while it is built
temporary_directory = os.path.join(working_directory, "temporary_delete_me")
os.makedirs(temporary_directory, exist_ok=True)

try:
    # Define the output directory for the chunks based on the parent directory.
    # Resolved before the walk so that it can be excluded from the scan.
    output_directory = create_directory_based_on_parent(working_directory)
    excluded_directories = {temporary_directory, output_directory}

    # Collect the paths of all files that can be read as UTF-8 text
    text_file_paths = []
    for root, dirs, files in os.walk(working_directory):
        # Prune in place so os.walk does not descend into the scratch
        # directory or into output left behind by an earlier run; otherwise
        # each run would fold the previous combined document into the new one.
        dirs[:] = [
            name for name in dirs
            if os.path.join(root, name) not in excluded_directories
        ]
        for file in files:
            file_path = os.path.join(root, file)
            try:
                # Decoding the whole file is the text/non-text check.
                with open(file_path, "r", encoding="utf-8") as f:
                    f.read()
            except UnicodeDecodeError:
                print(f"Skipping non-text file: {file_path}")
            except OSError as error:
                # Unreadable entries (permissions, broken links, ...) should
                # not abort the whole run.
                print(f"Skipping unreadable file: {file_path} ({error})")
            else:
                text_file_paths.append(file_path)

    # Write the listing of paths, a separator, then every file's contents
    combined_file_name = "combined.txt"
    combined_file_path = os.path.join(temporary_directory, combined_file_name)
    # Explicit UTF-8: the sources are decoded as UTF-8 and the chunker reads
    # this file back as UTF-8, so the platform default must not be used here.
    with open(combined_file_path, "w", encoding="utf-8") as combined:
        for path in text_file_paths:
            combined.write(path + "\n")
        combined.write("===============")
        combined.write("\n")
        for file_path in text_file_paths:
            combined.write(f"\n\n#=#=#=#=#{file_path}#=#=#=#=#\n\n")
            with open(file_path, "r", encoding="utf-8") as src_file:
                combined.write(src_file.read())
            combined.write("\n")
    print("Combined document created successfully.")

    print("Output Directory:", output_directory)

    # Move the combined document to the output directory. Passing the full
    # destination file path (not just the directory) lets a re-run replace an
    # existing combined.txt instead of failing because it already exists.
    final_combined_path = os.path.join(output_directory, combined_file_name)
    shutil.move(combined_file_path, final_combined_path)

    # Copy chunks of approximately 7000 characters to separate files
    copy_chunks_to_files(final_combined_path, output_directory)
finally:
    # Remove temporary_delete_me directory, even if a step above failed
    shutil.rmtree(temporary_directory)