# mat_mult_block.py
from mpi4py import MPI
import numpy as np
from matmult_funcs import init_matrices, factor_procs
import time as t
world = MPI.COMM_WORLD
rank = world.Get_rank()
nProcs = world.Get_size()
# Default Values
iProcs = 1
jProcs = 1
# sizeA = np.zeros((2,1),dtype='d')
# sizeB = np.zeros((2,1),dtype='d')
sizeA = [0, 0]
sizeB = [0, 0]
# Global Initialisation
if rank == 0:
    # init_matrices takes the sizes of A and B plus optional low/high bounds
    # for the random entries (defaults: low = 0, high = 1)
    sizeA = [250, 100]
    sizeB = [100, 150]
    A, B = init_matrices(sizeA, sizeB)
    C = np.zeros((sizeA[0], sizeB[1]), dtype='d')
    iProcs, jProcs = factor_procs(nProcs, sizeA, sizeB)
    # print(iProcs, jProcs)
    # time_taken = 0
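
# The two helpers above come from matmult_funcs, which is not shown here. The
# sketches below are hypothetical reference implementations, written only to
# illustrate the expected interfaces; the real module may differ.
def _init_matrices_sketch(sizeA, sizeB, low=0, high=1):
    # Random A (sizeA[0] x sizeA[1]) and B (sizeB[0] x sizeB[1]) with entries
    # drawn uniformly from [low, high)
    A = np.random.uniform(low, high, size=(sizeA[0], sizeA[1]))
    B = np.random.uniform(low, high, size=(sizeB[0], sizeB[1]))
    return A, B

def _factor_procs_sketch(nProcs, sizeA, sizeB):
    # Split nProcs into an iProcs x jProcs grid whose aspect ratio roughly
    # matches that of C (sizeA[0] rows by sizeB[1] columns)
    target = sizeA[0] / sizeB[1]
    best = (nProcs, 1)
    for i in range(1, nProcs + 1):
        if nProcs % i == 0:
            j = nProcs // i
            if abs(i / j - target) < abs(best[0] / best[1] - target):
                best = (i, j)
    return best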
# Communication of the decomposition
sizeA = world.bcast(sizeA,root=0)
sizeB = world.bcast(sizeB,root=0)
iProcs = world.bcast(iProcs,root=0)
jProcs = world.bcast(jProcs,root=0)
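# Note: lowercase bcast() ships generic Python objects via pickle, which is
# convenient for small metadata like these lists and ints; the capitalised
# Send/Recv calls used below operate on raw buffers (contiguous NumPy arrays)
# and avoid that serialisation overhead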
# print(sizeA,sizeB,iProcs,jProcs,rank)
if rank == 0:
    t1 = t.time()
# Local Initialisation
sizeC = [sizeA[0], sizeB[1]]  # not strictly necessary, but kept for clarity
# Integer division: the decomposition assumes iProcs and jProcs divide the
# dimensions of C evenly (see the note at the end of the file)
blockSize = [sizeC[0] // iProcs, sizeC[1] // jProcs]
blockLength = sizeA[1]  # by now, this equals sizeB[0]
# blockSize holds the number of rows of A and columns of B each process
# receives; blockLength is the shared inner dimension of the blocks
local_A_rows = np.zeros((blockSize[0], blockLength), dtype='d')
local_B_cols = np.zeros((blockLength, blockSize[1]), dtype='d')
local_C_block = np.zeros((blockSize[0], blockSize[1]), dtype='d')
# print(blockSize,rank)
# Communication based on decomposition
# Rank 0 acts as the master: it sends the relevant blocks of A and B to each
# process, keeping block (0, 0) for itself
if rank == 0:
    # linspace returns floats, so cast the block boundaries to int before slicing
    i_indices = np.linspace(0, sizeC[0], iProcs + 1).astype(int)
    j_indices = np.linspace(0, sizeC[1], jProcs + 1).astype(int)
    # print(i_indices)
    # print(j_indices)
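    # Worked example: with sizeC = [250, 150] and (say) iProcs = 2, jProcs = 3,
    # i_indices is [0, 125, 250] and j_indices is [0, 50, 100, 150]; process
    # (i, j) then owns rows i_indices[i]:i_indices[i+1] of A and columns
    # j_indices[j]:j_indices[j+1] of B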
    for i_proc in range(0, iProcs):
        for j_proc in range(0, jProcs):
            proc_id = i_proc * jProcs + j_proc
            if proc_id == 0:
                # The master keeps its own block rather than sending to itself
                local_A_rows = A[i_indices[i_proc]:i_indices[i_proc+1], :]
                local_B_cols = B[:, j_indices[j_proc]:j_indices[j_proc+1]]
                # print(local_B_cols)
            else:
                send_tag_A = 100 + proc_id
                send_tag_B = 200 + proc_id
                world.Send([A[i_indices[i_proc]:i_indices[i_proc+1], :], MPI.DOUBLE], dest=proc_id, tag=send_tag_A)
                # print(B)
                # print(B[:, j_indices[j_proc]:j_indices[j_proc+1]])
                # Column slices of B are not contiguous, so copy before sending
                world.Send([np.ascontiguousarray(B[:, j_indices[j_proc]:j_indices[j_proc+1]]), MPI.DOUBLE], dest=proc_id, tag=send_tag_B)
else:
    world.Recv(local_A_rows, source=0, tag=100 + rank)
    world.Recv(local_B_cols, source=0, tag=200 + rank)
# if rank == 0:
# print(A)
# else:
# print(local_A_rows,rank)
#
# if rank == 0:
# print(B)
# else:
# print(local_B_cols,rank)
# Calculation
# # using numpy
# local_C_block = np.dot(local_A_rows,local_B_cols)
# conventional loops
for i in range(blockSize[0]):
    for j in range(blockSize[1]):
        for k in range(blockLength):
            local_C_block[i, j] += local_A_rows[i, k] * local_B_cols[k, j]
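# Note: the pure-Python triple loop above is kept for illustration; the np.dot
# variant commented out before it dispatches to an optimised BLAS kernel and is
# typically orders of magnitude faster at these block sizes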
# Communication to master
# A process with rank "rank" is the (I, J)-th process in the process grid:
#   I = rank // jProcs
#   J = rank % jProcs
# (so I*jProcs + J recovers the rank itself)
# Process (I, J) sends block (I, J) to the master, which stores it in C at
#   rows    I*blockSize[0] : (I+1)*blockSize[0]
#   columns J*blockSize[1] : (J+1)*blockSize[1]
if rank != 0:
    world.Send([local_C_block, MPI.DOUBLE], dest=0, tag=300 + rank)
else:
    # Rank 0's own result is block (0, 0) of C
    C[0:blockSize[0], 0:blockSize[1]] = local_C_block
    for i_proc in range(0, iProcs):
        for j_proc in range(0, jProcs):
            proc_id = i_proc * jProcs + j_proc
            if proc_id != 0:
                recv_tag_C = 300 + proc_id
                world.Recv(local_C_block, source=proc_id, tag=recv_tag_C)
                C[i_proc*blockSize[0]:(i_proc+1)*blockSize[0], j_proc*blockSize[1]:(j_proc+1)*blockSize[1]] = local_C_block
    # print(C)
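# Design note: since every block has the same shape, this point-to-point gather
# could instead use a collective such as world.Gather into a
# (nProcs, blockSize[0], blockSize[1]) buffer followed by a reshuffle into C,
# avoiding nProcs-1 separate Recv calls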
if rank == 0:
    t2 = t.time()
    time_taken = t2 - t1
if rank == 0:
    # Verify against a straight numpy product
    C_act = np.dot(A, B)
    # print(C_act, "actual")
    print("max abs error:", np.amax(np.abs(C - C_act)))
if rank == 0:
    print(time_taken, "secs in", nProcs, "processes.")
# print("Done", rank)
# Future: handle block counts that do not divide the matrix dimensions evenly
# (e.g. so that 3 processes can be used)
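
# Example invocation (assumes mpi4py is installed and matmult_funcs is on the
# Python path); the process count must factor into a grid that divides the
# matrix dimensions evenly, per the note above:
#   mpiexec -n 10 python mat_mult_block.py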