Skip to content

Commit

Permalink
Merge branch 'first_class_loops' of github.com:spcl/dace into first_c…
Browse files Browse the repository at this point in the history
…lass_loops
  • Loading branch information
phschaad committed Oct 20, 2023
2 parents a935fe5 + 2f3568b commit d84d0e3
Show file tree
Hide file tree
Showing 14 changed files with 265 additions and 231 deletions.
17 changes: 16 additions & 1 deletion dace/codegen/targets/rtl.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.

import itertools

from typing import List, Tuple, Dict
import warnings

from dace import dtypes, config, registry, symbolic, nodes, sdfg, data
from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode
Expand Down Expand Up @@ -102,6 +102,21 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i
elif isinstance(arr, data.Scalar):
line: str = "{} {} = {};".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
edge.src.data)
elif isinstance(arr, data.Stream):
# TODO Streams are currently unsupported, as the proper
# behaviour has to be implemented to avoid deadlocking. It
# is only a warning, as the RTL backend is partially used
# by the Xilinx backend, which may hit this case, but will
# discard the erroneous code.
warnings.warn(
'Streams are currently unsupported by the RTL backend.' \
'This may produce errors or deadlocks in the generated code.'
)
line: str = "// WARNING: Unsupported read from ({}) variable '{}' from stream '{}'." \
" This may lead to a deadlock if used in code.\n".format(
dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src_conn)
line += "{} {} = {}.pop();".format(
dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data)
elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet):
rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants)
Expand Down
39 changes: 20 additions & 19 deletions samples/fpga/rtl/add_fortytwo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
#
# This sample shows adding a constant integer value to a stream of integers.
#
# It is intended for running hardware_emulation or hardware xilinx targets.
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""
This sample shows adding a constant integer value to a stream of integers.
It is intended for running hardware_emulation or hardware xilinx targets.
"""

import dace
import numpy as np
Expand Down Expand Up @@ -116,21 +117,21 @@
######################################################################

if __name__ == '__main__':
with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
# init data structures
N.set(8192)
a = np.random.randint(0, 100, N.get()).astype(np.int32)
b = np.zeros((N.get(), )).astype(np.int32)

# init data structures
N.set(8192)
a = np.random.randint(0, 100, N.get()).astype(np.int32)
b = np.zeros((N.get(), )).astype(np.int32)

# show initial values
print("a={}, b={}".format(a, b))
# show initial values
print("a={}, b={}".format(a, b))

# call program
sdfg(A=a, B=b, N=N)
# call program
sdfg(A=a, B=b, N=N)

# show result
print("a={}, b={}".format(a, b))
# show result
print("a={}, b={}".format(a, b))

# check result
for i in range(N.get()):
assert b[i] == a[i] + 42
# check result
for i in range(N.get()):
assert b[i] == a[i] + 42
13 changes: 8 additions & 5 deletions samples/fpga/rtl/axpy.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
#
# This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
# operations. It is intended for running hardware_emulation or hardware xilinx targets.
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""
This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
operations.
It is intended for running hardware_emulation or hardware xilinx targets.
"""

import dace
import numpy as np
Expand Down Expand Up @@ -259,4 +262,4 @@ def make_sdfg(veclen=2):
expected = a * x + y
diff = np.linalg.norm(expected - result) / N.get()
print("Difference:", diff)
exit(0 if diff <= 1e-5 else 1)
assert diff <= 1e-5
143 changes: 72 additions & 71 deletions samples/fpga/rtl/axpy_double_pump.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,74 @@
# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
#
# This sample shows the AXPY BLAS routine. It is implemented through Xilinx
# IPs in order to utilize double pumping, which doubles the performance per
# consumed FPGA resource. The double pumping operation is "inwards", which
# means that the internal vectorization width of the core computation is half
# that of the external vectorization width. This translates into utilizing half
# the amount of internal computing resources, compared to a regular vectorized
# implementation. The block diagram of the design for a 32-bit floating-point
# implementation using vectorization width 2 is:
#
# ap_aclk s_axis_y_in s_axis_x_in a
# │ │ │ │
# │ │ │ │
# │ │ │ │
# ┌───────┼─────────┬────────┼─────────┐ │ │
# │ │ │ │ │ │ │
# │ │ │ ▼ │ ▼ │
# │ │ │ ┌────────────┐ │ ┌────────────┐ │
# │ │ └─►│ │ └─►│ │ │
# │ │ │ Clock sync │ │ Clock sync │ │
# │ │ ┌─►│ │ ┌─►│ │ │
# │ ▼ 300 MHz │ └─────┬──────┘ │ └─────┬──────┘ │
# │ ┌────────────┐ │ │ │ │ │
# │ │ Clock │ │ │ │ │ │
# │ │ │ ├────────┼─────────┤ │ │
# │ │ Multiplier │ │ │ │ │ │
# │ └─────┬──────┘ │ ▼ 64 bit │ ▼ 64 bit │
# │ │ 600 MHz │ ┌────────────┐ │ ┌────────────┐ │
# │ │ │ │ │ │ │ │ │
# │ └─────────┼─►│ Data issue │ └─►│ Data issue │ │
# │ │ │ │ │ │ │
# │ │ └─────┬──────┘ └─────┬──────┘ │
# │ │ │ 32 bit │ 32 bit │
# │ │ │ │ │
# │ │ │ │ │
# │ │ │ ▼ ▼
# │ │ │ ┌────────────┐
# │ │ │ │ │
# │ ├────────┼────────────────►│ Multiplier │
# │ │ │ │ │
# │ │ │ └─────┬──────┘
# │ │ │ │
# │ │ │ ┌──────────────┘
# │ │ │ │
# │ │ ▼ ▼
# │ │ ┌────────────┐
# │ │ │ │
# │ ├─────►│ Adder │
# │ │ │ │
# │ │ └─────┬──────┘
# │ │ │
# │ │ ▼ 32 bit
# │ │ ┌─────────────┐
# │ │ │ │
# │ ├─────►│ Data packer │
# │ │ │ │
# │ │ └─────┬───────┘
# │ │ │ 64 bit
# │ │ ▼
# │ │ ┌────────────┐
# │ └─────►│ │
# │ │ Clock sync │
# └───────────────────────►│ │
# └─────┬──────┘
# │
# ▼
# m_axis_result_out
#
# It is intended for running hardware_emulation or hardware xilinx targets.
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""
This sample shows the AXPY BLAS routine. It is implemented through Xilinx
IPs in order to utilize double pumping, which doubles the performance per
consumed FPGA resource. The double pumping operation is "inwards", which
means that the internal vectorization width of the core computation is half
that of the external vectorization width. This translates into utilizing half
the amount of internal computing resources, compared to a regular vectorized
implementation. The block diagram of the design for a 32-bit floating-point
implementation using vectorization width 2 is:
ap_aclk s_axis_y_in s_axis_x_in a
│ │ │ │
│ │ │ │
│ │ │ │
┌───────┼─────────┬────────┼─────────┐ │ │
│ │ │ │ │ │ │
│ │ │ ▼ │ ▼ │
│ │ │ ┌────────────┐ │ ┌────────────┐ │
│ │ └─►│ │ └─►│ │ │
│ │ │ Clock sync │ │ Clock sync │ │
│ │ ┌─►│ │ ┌─►│ │ │
│ ▼ 300 MHz │ └─────┬──────┘ │ └─────┬──────┘ │
│ ┌────────────┐ │ │ │ │ │
│ │ Clock │ │ │ │ │ │
│ │ │ ├────────┼─────────┤ │ │
│ │ Multiplier │ │ │ │ │ │
│ └─────┬──────┘ │ ▼ 64 bit │ ▼ 64 bit │
│ │ 600 MHz │ ┌────────────┐ │ ┌────────────┐ │
│ │ │ │ │ │ │ │ │
│ └─────────┼─►│ Data issue │ └─►│ Data issue │ │
│ │ │ │ │ │ │
│ │ └─────┬──────┘ └─────┬──────┘ │
│ │ │ 32 bit │ 32 bit │
│ │ │ │ │
│ │ │ │ │
│ │ │ ▼ ▼
│ │ │ ┌────────────┐
│ │ │ │ │
│ ├────────┼────────────────►│ Multiplier │
│ │ │ │ │
│ │ │ └─────┬──────┘
│ │ │ │
│ │ │ ┌──────────────┘
│ │ │ │
│ │ ▼ ▼
│ │ ┌────────────┐
│ │ │ │
│ ├─────►│ Adder │
│ │ │ │
│ │ └─────┬──────┘
│ │ │
│ │ ▼ 32 bit
│ │ ┌─────────────┐
│ │ │ │
│ ├─────►│ Data packer │
│ │ │ │
│ │ └─────┬───────┘
│ │ │ 64 bit
│ │ ▼
│ │ ┌────────────┐
│ └─────►│ │
│ │ Clock sync │
└───────────────────────►│ │
└─────┬──────┘
m_axis_result_out
It is intended for running hardware_emulation or hardware xilinx targets.
"""

import dace
import numpy as np
Expand Down Expand Up @@ -452,4 +453,4 @@ def make_sdfg(veclen=2):
diff = np.linalg.norm(expected - result) / N.get()
print("Difference:", diff)

exit(0 if diff <= 1e-5 else 1)
assert diff <= 1e-5
17 changes: 9 additions & 8 deletions samples/fpga/rtl/fladd.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
#
# This sample shows how to utilize an IP core in an RTL tasklet. This is done
# through the vector add problem, which adds two floating point vectors
# together.
#
# It is intended for running hardware_emulation or hardware xilinx targets.
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""
This sample shows how to utilize an IP core in an RTL tasklet. This is done
through the vector add problem, which adds two floating point vectors
together.
It is intended for running hardware_emulation or hardware xilinx targets.
"""

import dace
import numpy as np
Expand Down Expand Up @@ -190,4 +191,4 @@
expected = a + b
diff = np.linalg.norm(expected - c) / N.get()
print("Difference:", diff)
exit(0 if diff <= 1e-5 else 1)
assert diff <= 1e-5
41 changes: 21 additions & 20 deletions samples/fpga/rtl/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
#
# This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
# integer input stream.
#
# It is intended for running hardware_emulation or hardware xilinx targets.
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""
This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
integer input stream.
It is intended for running hardware_emulation or hardware xilinx targets.
"""

import dace
import numpy as np
Expand Down Expand Up @@ -151,21 +152,21 @@
######################################################################

if __name__ == '__main__':
with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
# init data structures
N.set(8192)
a = np.random.randint(0, 100, N.get()).astype(np.int32)
b = np.zeros((N.get(), )).astype(np.int32)

# init data structures
N.set(8192)
a = np.random.randint(0, 100, N.get()).astype(np.int32)
b = np.zeros((N.get(), )).astype(np.int32)

# show initial values
print("a={}, b={}".format(a, b))
# show initial values
print("a={}, b={}".format(a, b))

# call program
sdfg(A=a, B=b, N=N)
# call program
sdfg(A=a, B=b, N=N)

# show result
print("a={}, b={}".format(a, b))
# show result
print("a={}, b={}".format(a, b))

# check result
for i in range(N.get()):
assert b[i] == a[i] + depth
# check result
for i in range(N.get()):
assert b[i] == a[i] + depth
Loading

0 comments on commit d84d0e3

Please sign in to comment.