diff --git a/.gitignore b/.gitignore
index 14b2842..59501da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,4 @@ __pycache__/
 .cache/
 dist/
 dnc.egg-info/
+tasks/checkpoints/
diff --git a/README.md b/README.md
index ae27460..eaa1466 100644
--- a/README.md
+++ b/README.md
@@ -59,8 +59,9 @@ output, (controller_hidden, memory, read_vectors) = \
 
 The copy task, as descibed in the original paper, is included in the repo.
 
+From the project root:
 ```
-python ./copy_task.py -cuda 0
+python ./tasks/copy_task.py -cuda 0
 ```
 
 ## General noteworthy stuff
diff --git a/dnc/dnc.py b/dnc/dnc.py
index f303a61..18cc3be 100644
--- a/dnc/dnc.py
+++ b/dnc/dnc.py
@@ -69,20 +69,11 @@ def __init__(
     for layer in range(self.num_layers):
       # controllers for each layer
       if self.rnn_type.lower() == 'rnn':
-        if layer == 0:
           self.rnns.append(nn.RNNCell(self.layer0_input_size, self.output_size, bias=self.bias, nonlinearity=self.nonlinearity))
-        else:
-          self.rnns.append(nn.RNNCell(self.layern_input_size, self.output_size, bias=self.bias, nonlinearity=self.nonlinearity))
       elif self.rnn_type.lower() == 'gru':
-        if layer == 0:
           self.rnns.append(nn.GRUCell(self.layer0_input_size, self.output_size, bias=self.bias))
-        else:
-          self.rnns.append(nn.GRUCell(self.layern_input_size, self.output_size, bias=self.bias))
       elif self.rnn_type.lower() == 'lstm':
-        # if layer == 0:
         self.rnns.append(nn.LSTMCell(self.layer0_input_size, self.output_size, bias=self.bias))
-        # else:
-        #   self.rnns.append(nn.LSTMCell(self.layern_input_size, self.output_size, bias=self.bias))
 
       # memories for each layer
       if not self.share_memory:
@@ -170,7 +161,7 @@ def _layer_forward(self, input, layer, hx=(None, None)):
     # the interface vector
     ξ = chx[0] if self.rnn_type.lower() == 'lstm' else chx
     # the output
-    out = self.output_weights(chx[0])
+    out = self.output_weights(chx[0]) if self.rnn_type.lower() == 'lstm' else self.output_weights(chx)
 
     # pass through memory
     if self.share_memory:
diff --git a/setup.py b/setup.py
index d98a37b..f45f0d7 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@
     keywords='differentiable neural computer dnc memory network',
 
-    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
+    packages=find_packages(exclude=['contrib', 'docs', 'tests', 'tasks']),
 
     install_requires=['torch', 'numpy'],
 
diff --git a/tasks/__init__.py b/tasks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dnc/copy_task.py b/tasks/copy_task.py
similarity index 96%
rename from dnc/copy_task.py
rename to tasks/copy_task.py
index ac3ccdd..d488d40 100644
--- a/dnc/copy_task.py
+++ b/tasks/copy_task.py
@@ -20,10 +20,11 @@
 from torch.nn.utils import clip_grad_norm
 
-from dnc import DNC
+from dnc.dnc import DNC
 
 parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer')
 parser.add_argument('-input_size', type=int, default= 6, help='dimension of input feature')
+parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller')
 parser.add_argument('-nhid', type=int, default=64, help='humber of hidden units of the inner nn')
 parser.add_argument('-nlayer', type=int, default=2, help='number of layers')
@@ -101,13 +102,6 @@ def criterion(predictions, targets):
 
   mem_size = args.mem_size
   read_heads = args.read_heads
-
-  # options, _ = getopt.getopt(sys.argv[1:], '', ['iterations='])
-
-  # for opt in options:
-  #   if opt[0] == '-iterations':
-  #     iterations = int(opt[1])
-
   rnn = DNC(
     input_size=args.input_size,
     hidden_size=args.nhid,
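
A quick post-patch sanity check, extending the README's invocation with the new flag — a minimal sketch, assuming the remaining copy-task defaults (`gru` and `rnn` are the other controller branches kept in `dnc/dnc.py`; `lstm` is the default):

```
python ./tasks/copy_task.py -cuda 0 -rnn_type gru
```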