-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_serial_sbatch.py
150 lines (113 loc) · 5.02 KB
/
generate_serial_sbatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# based off previous generate_sbatch.py files developed by zach1502 for assignment 6
import os
import time
import subprocess
import glob
# --- Build & configuration --------------------------------------------------
# Compile the C++ benchmark binaries before generating any job scripts.
subprocess.run(["make"], shell=False)

sleep_time = 5  # polling interval (seconds) while waiting for queued jobs
STUDENT_ID = "rka120"
ASSIGNMENT_FOLDER = "CMPT_431_Project"
# Explicit validation instead of `assert`: asserts are stripped under -O,
# which would let an empty ID slip through silently.
if not (STUDENT_ID and ASSIGNMENT_FOLDER):
    raise ValueError("Please fill in the STUDENT_ID and ASSIGNMENT_FOLDER variables.")

commands = [
    f"/home/{STUDENT_ID}/{ASSIGNMENT_FOLDER}/knapsack_serial",
]
# Ensure each benchmark binary is executable for the current user.
for command in commands:
    subprocess.run(["chmod", "u+x", command], check=True)

mpi_processes = [1]
nodes = [1]
iterations = 3
# input_files = ["--fName thousand_item_input.txt", "--fName hundred_thousand_item_input.txt", "--fName ten_million_item_input.txt"]
input_files = ["--fName hundred_thousand_input.txt"]
capacities = ["--capacity 1000", "--capacity 100000", "--capacity 1000000"]
# capacities = ["--capacity 1500"]
max_jobs_per_batch = 4
max_total_cpus = 8

# Directory that holds the generated .sbatch files. Created exactly once
# (the original script created it twice, at two different points).
output_dir = "sbatch_files"
os.makedirs(output_dir, exist_ok=True)
def generate_sbatch_content(program, num_processes, num_nodes, iteration, params):
    """Return the text of an sbatch script for one benchmark run.

    Args:
        program: absolute path of the binary to run with srun.
        num_processes: number of MPI tasks (--ntasks).
        num_nodes: number of nodes (--nodes).
        iteration: 1-based repetition index, echoed for log parsing.
        params: CLI argument string, e.g. "--fName input.txt --capacity 1000".
    """
    # Derive the echo fields from `params` instead of reading the `input_file`
    # and `cap` loop globals: the original silently depended on those globals,
    # which breaks (NameError) if this function is called outside the loop.
    tokens = params.split()
    input_name = tokens[1] if len(tokens) > 1 else ""
    capacity_value = tokens[3] if len(tokens) > 3 else ""
    return f"""#!/bin/bash
#SBATCH --nodes={num_nodes}
#SBATCH --ntasks={num_processes}
#SBATCH --cpus-per-task=1
#SBATCH --time=10:00
#SBATCH --mem=5G
#SBATCH --partition=slow
echo "Running {program.split('/')[-1]} with {num_processes} MPI processes on {num_nodes} nodes, input {input_name} & capacity {capacity_value}: Iteration {iteration}"
srun {program} {params}
"""
# --- Generate one sbatch file per (program, input, capacity, procs, nodes, iteration) ---
sbatch_files = []
cpu_requests = []  # parallel to sbatch_files: CPUs each job will request
for program in commands:
    program_name = program.split('/')[-1]
    for input_file in input_files:
        # e.g. "--fName hundred_thousand_input.txt" -> "hundred_thousand_input.txt"
        input_name = input_file.split(' ')[-1]
        for cap in capacities:
            cap_value = cap.split(' ')[-1]  # numeric capacity only, for the filename
            params = f"{input_file} {cap}"
            for num_processes in mpi_processes:
                for num_nodes in nodes:
                    # Skip invalid combinations (fewer tasks than nodes).
                    if num_processes < num_nodes:
                        continue
                    for iteration in range(1, iterations + 1):
                        # Include the input name and only the numeric capacity:
                        # the original embedded "--capacity N" verbatim (spaces
                        # in the filename) and omitted the input file, so runs
                        # over different inputs overwrote each other's scripts.
                        filename = (
                            f"test_{program_name}_{input_name}_n{num_processes}"
                            f"_nodes{num_nodes}_iter{iteration}_cap{cap_value}.sbatch"
                        )
                        filepath = os.path.join(output_dir, filename)
                        sbatch_content = generate_sbatch_content(
                            program, num_processes, num_nodes, iteration, params
                        )
                        with open(filepath, 'w') as sbatch_file:
                            sbatch_file.write(sbatch_content)
                        sbatch_files.append(filepath)
                        cpu_requests.append(num_processes)
print(f"Generated {len(sbatch_files)} sbatch files in directory: {output_dir}")
def submit_sbatch(file):
    """Submit one sbatch script to Slurm, reporting success or failure."""
    try:
        subprocess.run(["sbatch", file], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to submit: {file} with error: {e}")
    else:
        # Only reached when sbatch exited with status 0.
        print(f"Submitted: {file}")
def check_user_jobs(user_id):
    """Return True if `user_id` still has jobs in the Slurm queue.

    `squeue -u <user>` always prints a header line, so more than one line
    of output means at least one job is still queued or running.
    """
    try:
        # check=True is required here: without it subprocess.run never raises
        # CalledProcessError, so the original except clause was unreachable
        # and squeue failures were misread as "jobs still running" or not.
        result = subprocess.run(
            ["squeue", "-u", user_id],
            stdout=subprocess.PIPE,
            universal_newlines=True,
            check=True,
        )
        return len(result.stdout.strip().split("\n")) > 1
    except subprocess.CalledProcessError as e:
        print(f"Error checking jobs: {e}")
        return False
# --- Submit in batches, respecting the job-count and CPU caps ----------------
i = 0
while i < len(sbatch_files):
    current_batch_jobs = 0
    current_batch_cpus = 0
    while current_batch_jobs < max_jobs_per_batch and i < len(sbatch_files):
        job_cpus = cpu_requests[i]
        if current_batch_cpus + job_cpus > max_total_cpus:
            # Adding this job would exceed the CPU cap for the batch.
            if current_batch_jobs == 0:
                # Guard against an infinite loop: a single job that alone
                # exceeds max_total_cpus would otherwise never be submitted
                # and `i` would never advance. Submit it in its own batch.
                print(f"Warning: {sbatch_files[i]} requests {job_cpus} CPUs "
                      f"(> {max_total_cpus}); submitting it alone.")
                submit_sbatch(sbatch_files[i])
                current_batch_jobs += 1
                current_batch_cpus += job_cpus
                i += 1
            break
        submit_sbatch(sbatch_files[i])
        current_batch_jobs += 1
        current_batch_cpus += job_cpus
        i += 1
    print(f"Submitted {current_batch_jobs} jobs using {current_batch_cpus} CPUs.")
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # Poll until the user's queue drains before starting the next batch.
    while check_user_jobs(STUDENT_ID):
        print(f"Waiting for jobs to finish... checking again in {sleep_time} seconds.")
        time.sleep(sleep_time)
    print("No jobs left. Proceeding to the next batch.")
def combine_slurm_outputs(output_filename="serial_combined_output.txt"):
    """Concatenate every slurm-*.out file, in job-id order, into one file."""
    matches = glob.glob("slurm-*.out")
    if not matches:
        print("No slurm output files found.")
        return
    # "slurm-12345.out" -> 12345; sort numerically so job 10 follows job 2.
    def job_id(name):
        return int(name.split("-")[1].split(".")[0])
    with open(output_filename, 'w') as combined_file:
        for name in sorted(matches, key=job_id):
            with open(name, 'r') as src:
                combined_file.write(f"--- Contents of {name} ---\n")
                combined_file.write(src.read())
                combined_file.write("\n\n")
    print(f"Combined all slurm output files into {output_filename}")
# Merge all slurm-*.out logs from this run into a single results file.
combine_slurm_outputs("serial_combined_output.txt")