diff --git a/appveyor.yml b/appveyor.yml index dc0445dac8..7d89dec958 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -85,11 +85,12 @@ install: - cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r19c-windows-x86_64.zip - cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r19c-windows-x86_64.zip -oC:\ndk - cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH% +- cmd: IF EXIST C:\cache\OpenBLAS\ rd /s /q C:\cache\OpenBLAS - cmd: IF %NAME%==android sed "s/clang+*/&.cmd/" cross-files/aarch64-linux-android >crossfile-aarch64 -- cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 appveyor DownloadFile https://github.com/borg323/OpenBLAS/releases/download/android-0.3.8-2/openblas-android-aarch64.zip +- cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 appveyor DownloadFile https://github.com/borg323/OpenBLAS/releases/download/android-0.3.27/openblas-android-aarch64.zip - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 7z x openblas-android-aarch64.zip -oC:\cache\OpenBLAS - cmd: IF %NAME%==android sed "s/clang+*/&.cmd/" cross-files/armv7a-linux-android >crossfile-armv7a -- cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-armv7a appveyor DownloadFile https://github.com/borg323/OpenBLAS/releases/download/android-0.3.8-2/openblas-android-armv7a.zip +- cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-armv7a appveyor DownloadFile https://github.com/borg323/OpenBLAS/releases/download/android-0.3.27/openblas-android-armv7a.zip - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-armv7a 7z x openblas-android-armv7a.zip -oC:\cache\OpenBLAS - cmd: set PKG_FOLDER="C:\cache" - cmd: IF NOT EXIST c:\cache mkdir c:\cache diff --git 
a/dist/install-dml.cmd b/dist/install-dml.cmd index 099f42958c..ca93411a55 100644 --- a/dist/install-dml.cmd +++ b/dist/install-dml.cmd @@ -2,14 +2,7 @@ where /q tar if errorlevel 1 goto error -where /q lc0.exe -if errorlevel 1 cd /d %~dp0 -where /q lc0.exe -if errorlevel 1 ( - echo This script must run in the lc0 folder. - pause - exit /b -) +cd /d %~dp0 cls echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend. diff --git a/meson.build b/meson.build index ef0e0afcdf..519bd98fc5 100644 --- a/meson.build +++ b/meson.build @@ -490,6 +490,22 @@ if get_option('build_backends') nvcc_extra_args = ['-arch=compute_' + cuda_cc, '-code=sm_' + cuda_cc] elif get_option('native_cuda') and nvcc_help.contains('-arch=native') nvcc_extra_args = ['-arch=native'] + elif nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major', '-Wno-deprecated-gpu-targets'] + else + nvcc_extra_args = ['-Wno-deprecated-gpu-targets'] + # Fallback for cuda versions without -arch=all-major. + foreach x : ['35', '50', '60', '70', '80'] + if nvcc_help.contains('sm_' + x) + nvcc_extra_args += '-gencode=arch=compute_' + x + ',code=sm_' + x + endif + endforeach + # For forward compatibility. + if nvcc_help.contains('sm_80') # Cuda 11+ + nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80' + elif nvcc_help.contains('sm_75') # Cuda 10+ + nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75' + endif endif foreach x : get_option('cudnn_include') cuda_arguments += ['-I', x] @@ -507,35 +523,6 @@ if get_option('build_backends') command : [nvcc, nvcc_extra_args, cuda_arguments] ) - # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to - # generate code for the major fp16 capable architectures. 
- if nvcc_extra_args == [] - nvcc_arch = '-arch=compute_70' - nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] - if host_machine.system() != 'windows' - nvcc_arch = '-arch=compute_60' - nvcc_sm_list += ['sm_60'] - if ['arm', 'aarch64'].contains(host_machine.cpu_family()) - message('Compiling for Jetson.') - nvcc_arch = '-arch=compute_53' - nvcc_sm_list = ['sm_53', 'sm_62', 'sm_72', 'sm_87'] - endif - endif - nvcc_extra_args = [nvcc_arch] - foreach x : nvcc_sm_list - if nvcc_help.contains(x) - nvcc_extra_args += '-code=' + x - endif - endforeach - # For forward compatibility. - if nvcc_help.contains('sm_90') # Cuda 12+ - nvcc_extra_args += '-gencode=arch=compute_90,code=compute_90' - elif nvcc_help.contains('sm_80') # Cuda 11+ - nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80' - elif nvcc_help.contains('sm_75') # Cuda 10+ - nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75' - endif - endif files += custom_target('cuda fp16 code', input : 'src/neural/cuda/fp16_kernels.cu', output : outputname, @@ -644,6 +631,7 @@ if get_option('build_backends') 'src/neural/xla/xla_tensor.cc', ] deps += cc.find_library('dl', required: false) + has_backends = true endif endif # if get_option('build_backends') diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index cba3a0cc0b..cc44f1a531 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -670,7 +670,8 @@ class CudaNetwork : public Network { stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming; if (allow_cache_opt_ && use_res_block_winograd_fuse_opt_ && - (res_block_mem <= scratch_size_) && (res_block_mem <= l2_cache_size_)) { + (static_cast<size_t>(res_block_mem) <= scratch_size_) && + (res_block_mem <= l2_cache_size_)) { // we can use a single alloc to hold all the required tensors, and enable // persistent L2 caching on it ReportCUDAErrors(cudaStreamSetAttribute( diff --git a/src/neural/cuda/network_cudnn.cc 
b/src/neural/cuda/network_cudnn.cc index dc39906fc4..7129595c89 100644 --- a/src/neural/cuda/network_cudnn.cc +++ b/src/neural/cuda/network_cudnn.cc @@ -938,7 +938,7 @@ class CudnnNetwork : public Network { std::unique_ptr<InputsOutputs> GetInputsOutputs() { std::lock_guard<std::mutex> lock(inputs_outputs_lock_); if (free_inputs_outputs_.empty()) { - return std::make_unique<InputsOutputs>(max_batch_size_, wdl_, false, + return std::make_unique<InputsOutputs>(max_batch_size_, wdl_, moves_left_); } else { std::unique_ptr<InputsOutputs> resource = diff --git a/src/neural/xla/network_xla.cc b/src/neural/xla/network_xla.cc index ff9ff7413b..1652ff817e 100644 --- a/src/neural/xla/network_xla.cc +++ b/src/neural/xla/network_xla.cc @@ -311,7 +311,7 @@ std::unique_ptr<Network> MakeXlaNetwork(const std::optional<WeightsFile>& w, w->format().network_format()); } -REGISTER_NETWORK("xla", MakeXlaNetwork, -34) +REGISTER_NETWORK("xla", MakeXlaNetwork, 34) } // namespace } // namespace lczero