diff --git a/Project.toml b/Project.toml index 026a29ba7..32d95d792 100644 --- a/Project.toml +++ b/Project.toml @@ -75,7 +75,7 @@ Reexport = "1.2" RuntimeGeneratedFunctions = "0.5.12" SafeTestsets = "0.1" SciMLBase = "2.28" -Statistics = "1.10" +Statistics = "1.11" SymbolicUtils = "1.5, 2, 3" Symbolics = "5.27.1, 6" Test = "1" diff --git a/src/PDE_BPINN.jl b/src/PDE_BPINN.jl index 0bf18c4f0..b03f15894 100644 --- a/src/PDE_BPINN.jl +++ b/src/PDE_BPINN.jl @@ -4,6 +4,7 @@ mutable struct PDELogTargetDensity{ P <: Vector{<:Distribution}, I, F, + FF, PH } dim::Int64 @@ -15,17 +16,19 @@ mutable struct PDELogTargetDensity{ extraparams::Int init_params::I full_loglikelihood::F + L2_loss2::FF Φ::PH function PDELogTargetDensity(dim, strategy, dataset, priors, allstd, names, extraparams, - init_params::AbstractVector, full_loglikelihood, Φ) + init_params::AbstractVector, full_loglikelihood, L2_loss2, Φ) new{ typeof(strategy), typeof(dataset), typeof(priors), typeof(init_params), typeof(full_loglikelihood), + typeof(L2_loss2), typeof(Φ) }(dim, strategy, @@ -36,18 +39,20 @@ mutable struct PDELogTargetDensity{ extraparams, init_params, full_loglikelihood, + L2_loss2, Φ) end function PDELogTargetDensity(dim, strategy, dataset, priors, allstd, names, extraparams, init_params::Union{NamedTuple, ComponentArrays.ComponentVector}, - full_loglikelihood, Φ) + full_loglikelihood, L2_loss2, Φ) new{ typeof(strategy), typeof(dataset), typeof(priors), typeof(init_params), typeof(full_loglikelihood), + typeof(L2_loss2), typeof(Φ) }(dim, strategy, @@ -58,15 +63,85 @@ mutable struct PDELogTargetDensity{ extraparams, init_params, full_loglikelihood, + L2_loss2, Φ) end end +# you get a vector of losses +function get_lossy(pinnrep, dataset, Dict_differentials) + eqs = pinnrep.eqs + depvars = pinnrep.depvars #depvar order is same as dataset + + # Dict_differentials is filled with Differential operator => diff_i key-value pairs + # masking operation + eqs_new = substitute.(eqs, Ref(Dict_differentials)) + + to_subs, tobe_subs = get_symbols(dataset, depvars, eqs) + + # for values of all depvars at corresponding indvar values in dataset, create dictionaries {Dict(x(t) => 1.0496435863173237, y(t) => 1.9227770685615337)} + # In each Dict, num form of depvar is key to its value at certain coords of indvars, n_dicts = n_rows_dataset(or n_indvar_coords_dataset) + eq_subs = [Dict(tobe_subs[depvar] => to_subs[depvar][i] for depvar in depvars) + for i in 1:size(dataset[1][:, 1])[1]] + + # for each dataset point(eq_sub dictionary), substitute in masked equations + # n_collocated_equations = n_rows_dataset(or n_indvar_coords_dataset) + masked_colloc_equations = [[substitute(eq, eq_sub) for eq in eqs_new] + for eq_sub in eq_subs] + # now we have vector of dataset depvar's collocated equations + + # reverse dict for re-substituting values of Differential(t)(u(t)) etc + rev_Dict_differentials = Dict(value => key for (key, value) in Dict_differentials) + + # unmask Differential terms in masked_colloc_equations + colloc_equations = [substitute.(masked_colloc_equation, Ref(rev_Dict_differentials)) + for masked_colloc_equation in masked_colloc_equations] + + # nested vector of datafree_pde_loss_functions (as in discretize.jl) + # each sub vector has dataset's indvar coord's datafree_colloc_loss_function, n_subvectors = n_rows_dataset(or n_indvar_coords_dataset) + # zip each colloc equation with args for each build_loss call per equation vector + datafree_colloc_loss_functions = [[build_loss_function(pinnrep, eq, pde_indvar) + for (eq, 
pde_indvar, integration_indvar) in zip(colloc_equation, + pinnrep.pde_indvars, + pinnrep.pde_integration_vars)] for colloc_equation in colloc_equations] + + return datafree_colloc_loss_functions +end + +function get_symbols(dataset, depvars, eqs) + # take only values of depvars from dataset + depvar_vals = [dataset_i[:, 1] for dataset_i in dataset] + # order of pinnrep.depvars, depvar_vals, BayesianPINN.dataset must be same + to_subs = Dict(depvars .=> depvar_vals) + + numform_vars = Symbolics.get_variables.(eqs) + Eq_vars = unique(reduce(vcat, numform_vars)) + # got equation's depvar num format {x(t)} for use in substitute() + + tobe_subs = Dict() + for a in depvars + for i in Eq_vars + expr = toexpr(i) + if (expr isa Expr) && (expr.args[1] == a) + tobe_subs[a] = i + end + end + end + # depvar symbolic and num format got, tobe_subs : Dict{Any, Any}(:y => y(t), :x => x(t)) + + return to_subs, tobe_subs +end + function LogDensityProblems.logdensity(Tar::PDELogTargetDensity, θ) # for parameter estimation neccesarry to use multioutput case - return Tar.full_loglikelihood(setparameters(Tar, θ), - Tar.allstd) + priorlogpdf(Tar, θ) + L2LossData(Tar, θ) - # + L2loss2(Tar, θ) + if Tar.L2_loss2 isa Nothing + return Tar.full_loglikelihood(setparameters(Tar, θ), Tar.allstd) + + priorlogpdf(Tar, θ) + L2LossData(Tar, θ) + else + return Tar.full_loglikelihood(setparameters(Tar, θ), Tar.allstd) + + priorlogpdf(Tar, θ) + L2LossData(Tar, θ) + + Tar.L2_loss2(setparameters(Tar, θ), Tar.allstd) + end end function setparameters(Tar::PDELogTargetDensity, θ) @@ -112,6 +187,8 @@ function L2LossData(Tar::PDELogTargetDensity, θ) # dataset of form Vector[matrix_x, matrix_y, matrix_z] # matrix_i is of form [i,indvar1,indvar2,..] (needed in case if heterogenous domains) + # note that indvar1,indvar2.. 
cols can be different values for different depvar matrices + # dataset,phi order follows pinnrep.depvars orders of variables (order of declaration in @variables macro) # Phi is the trial solution for each NN in chain array # Creating logpdf( MvNormal(Phi(t,θ),std), dataset[i] ) @@ -157,27 +234,6 @@ function priorlogpdf(Tar::PDELogTargetDensity, θ) return logpdf(nnwparams, θ) end -function integratorchoice(Integratorkwargs, initial_ϵ) - Integrator = Integratorkwargs[:Integrator] - if Integrator == JitteredLeapfrog - jitter_rate = Integratorkwargs[:jitter_rate] - Integrator(initial_ϵ, jitter_rate) - elseif Integrator == TemperedLeapfrog - tempering_rate = Integratorkwargs[:tempering_rate] - Integrator(initial_ϵ, tempering_rate) - else - Integrator(initial_ϵ) - end -end - -function adaptorchoice(Adaptor, mma, ssa) - if Adaptor != AdvancedHMC.NoAdaptation() - Adaptor(mma, ssa) - else - AdvancedHMC.NoAdaptation() - end -end - function inference(samples, pinnrep, saveats, numensemble, ℓπ) domains = pinnrep.domains phi = pinnrep.phi @@ -242,6 +298,27 @@ function inference(samples, pinnrep, saveats, numensemble, ℓπ) return ensemblecurves, estimatedLuxparams, estimated_params, timepoints end +function integratorchoice(Integratorkwargs, initial_ϵ) + Integrator = Integratorkwargs[:Integrator] + if Integrator == JitteredLeapfrog + jitter_rate = Integratorkwargs[:jitter_rate] + Integrator(initial_ϵ, jitter_rate) + elseif Integrator == TemperedLeapfrog + tempering_rate = Integratorkwargs[:tempering_rate] + Integrator(initial_ϵ, tempering_rate) + else + Integrator(initial_ϵ) + end +end + +function adaptorchoice(Adaptor, mma, ssa) + if Adaptor != AdvancedHMC.NoAdaptation() + Adaptor(mma, ssa) + else + AdvancedHMC.NoAdaptation() + end +end + """ ahmc_bayesian_pinn_pde(pde_system, discretization; draw_samples = 1000, @@ -290,15 +367,56 @@ end function ahmc_bayesian_pinn_pde(pde_system, discretization; draw_samples = 1000, bcstd = [0.01], l2std = [0.05], - phystd = [0.05], priorsNNw = (0.0, 2.0), + phystd = [0.05], phystdnew = [0.05], priorsNNw = (0.0, 2.0), param = [], nchains = 1, Kernel = HMC(0.1, 30), Adaptorkwargs = (Adaptor = StanHMCAdaptor, Metric = DiagEuclideanMetric, targetacceptancerate = 0.8), Integratorkwargs = (Integrator = Leapfrog,), saveats = [1 / 10.0], - numensemble = floor(Int, draw_samples / 3), progress = false, verbose = false) + numensemble = floor(Int, draw_samples / 3), Dict_differentials = nothing, + progress = false, verbose = false) pinnrep = symbolic_discretize(pde_system, discretization) dataset_pde, dataset_bc = discretization.dataset + newloss = if Dict_differentials isa Nothing + nothing + else + datafree_colloc_loss_functions = get_lossy(pinnrep, dataset_pde, Dict_differentials) + # equals number of indvar coords in dataset + # add case for if parameters present in bcs? + + train_sets_pde = get_dataset_train_points(pde_system.eqs, + dataset_pde, + pinnrep) + colloc_train_sets = [[hcat(train_sets_pde[i][:, j]...)' for i in eachindex(datafree_colloc_loss_functions[1])] for j in eachindex(datafree_colloc_loss_functions)] + + # for each datafree_colloc_loss_function create loss_functions by passing dataset's indvar coords as train_sets_pde. + # placeholder strategy = GridTraining(0.1), datafree_bc_loss_function and train_sets_bc must be nothing + # order of indvar coords will be same as corresponding depvar coords values in dataset provided in get_lossy() call. 
+ pde_loss_function_points = [merge_strategy_with_loglikelihood_function( + pinnrep, + GridTraining(0.1), + datafree_colloc_loss_functions[i], + nothing; + train_sets_pde = colloc_train_sets[i], + train_sets_bc = nothing)[1] + for i in eachindex(datafree_colloc_loss_functions)] + + function L2_loss2(θ, allstd) + stdpdesnew = allstd[4] + + # first vector of losses,from tuple -> pde losses, first[1] pde loss + pde_loglikelihoods = [sum([pde_loss_function(θ, stdpdesnew[i]) + for (i, pde_loss_function) in enumerate(pde_loss_functions)]) + for pde_loss_functions in pde_loss_function_points] + + # bc_loglikelihoods = [sum([bc_loss_function(θ, stdpdesnew[i]) for (i, bc_loss_function) in enumerate(pde_loss_function_points[1])]) for pde_loss_function_points in pde_loss_functions] + # for (j, bc_loss_function) in enumerate(bc_loss_functions)] + + return sum(pde_loglikelihoods) + end + end + + # [WIP] add overall functionality for BC dataset points (case of parametric BC) if ((dataset_bc isa Nothing) && (dataset_pde isa Nothing)) dataset = nothing elseif dataset_bc isa Nothing @@ -328,9 +446,6 @@ function ahmc_bayesian_pinn_pde(pde_system, discretization; # NN solutions for loglikelihood which is used for L2lossdata Φ = pinnrep.phi - # for new L2 loss - # discretization.additional_loss = - if nchains < 1 throw(error("number of chains must be greater than or equal to 1")) end @@ -356,7 +471,7 @@ function ahmc_bayesian_pinn_pde(pde_system, discretization; # append Ode params to all paramvector - initial_θ if ninv > 0 # shift ode params(initialise ode params by prior means) - # check if means or user speified is better + # check if means or user specified is better initial_θ = vcat(initial_θ, [Distributions.params(param[i])[1] for i in 1:ninv]) priors = vcat(priors, param) nparameters += ninv @@ -370,11 +485,12 @@ function ahmc_bayesian_pinn_pde(pde_system, discretization; strategy, dataset, priors, - [phystd, bcstd, l2std], + [phystd, bcstd, l2std, phystdnew], names, ninv, initial_nnθ, full_weighted_loglikelihood, + newloss, Φ) Adaptor, Metric, targetacceptancerate = Adaptorkwargs[:Adaptor], @@ -389,10 +505,14 @@ function ahmc_bayesian_pinn_pde(pde_system, discretization; ℓπ.allstd)) @info("Current Prior Log-likelihood : ", priorlogpdf(ℓπ, initial_θ)) @info("Current MSE against dataset Log-likelihood : ", L2LossData(ℓπ, initial_θ)) + if !(newloss isa Nothing) + @info("Current L2_LOSSY : ", + ℓπ.L2_loss2(setparameters(ℓπ, initial_θ), + ℓπ.allstd)) + end # parallel sampling option if nchains != 1 - # Cache to store the chains bpinnsols = Vector{Any}(undef, nchains) @@ -448,6 +568,11 @@ function ahmc_bayesian_pinn_pde(pde_system, discretization; @info("Current Prior Log-likelihood : ", priorlogpdf(ℓπ, samples[end])) @info("Current MSE against dataset Log-likelihood : ", L2LossData(ℓπ, samples[end])) + if !(newloss isa Nothing) + @info("Current L2_LOSSY : ", + ℓπ.L2_loss2(setparameters(ℓπ, samples[end]), + ℓπ.allstd)) + end fullsolution = BPINNstats(mcmc_chain, samples, stats) ensemblecurves, estimnnparams, estimated_params, timepoints = inference(samples, diff --git a/src/discretize.jl b/src/discretize.jl index 9a40e0fe8..7eb6c97af 100644 --- a/src/discretize.jl +++ b/src/discretize.jl @@ -512,29 +512,30 @@ function SciMLBase.symbolic_discretize(pde_system::PDESystem, bc_indvars, bc_integration_vars)] - pde_loss_functions, bc_loss_functions = merge_strategy_with_loss_function(pinnrep, - strategy, - datafree_pde_loss_functions, - datafree_bc_loss_functions) - # setup for all adaptive losses - 
num_pde_losses = length(pde_loss_functions) - num_bc_losses = length(bc_loss_functions) - # assume one single additional loss function if there is one. this means that the user needs to lump all their functions into a single one, - num_additional_loss = additional_loss isa Nothing ? 0 : 1 - - adaloss_T = eltype(adaloss.pde_loss_weights) - - # this will error if the user has provided a number of initial weights that is more than 1 and doesn't match the number of loss functions - adaloss.pde_loss_weights = ones(adaloss_T, num_pde_losses) .* adaloss.pde_loss_weights - adaloss.bc_loss_weights = ones(adaloss_T, num_bc_losses) .* adaloss.bc_loss_weights - adaloss.additional_loss_weights = ones(adaloss_T, num_additional_loss) .* - adaloss.additional_loss_weights + function get_likelihood_estimate_function(discretization::PhysicsInformedNN) + pde_loss_functions, bc_loss_functions = merge_strategy_with_loss_function(pinnrep, + strategy, + datafree_pde_loss_functions, + datafree_bc_loss_functions) + # setup for all adaptive losses + num_pde_losses = length(pde_loss_functions) + num_bc_losses = length(bc_loss_functions) + # assume one single additional loss function if there is one. this means that the user needs to lump all their functions into a single one, + num_additional_loss = additional_loss isa Nothing ? 0 : 1 + + adaloss_T = eltype(adaloss.pde_loss_weights) + + # this will error if the user has provided a number of initial weights that is more than 1 and doesn't match the number of loss functions + adaloss.pde_loss_weights = ones(adaloss_T, num_pde_losses) .* + adaloss.pde_loss_weights + adaloss.bc_loss_weights = ones(adaloss_T, num_bc_losses) .* adaloss.bc_loss_weights + adaloss.additional_loss_weights = ones(adaloss_T, num_additional_loss) .* + adaloss.additional_loss_weights reweight_losses_func = generate_adaptive_loss_function(pinnrep, adaloss, pde_loss_functions, bc_loss_functions) - function get_likelihood_estimate_function(discretization::PhysicsInformedNN) function full_loss_function(θ, p) # the aggregation happens on cpu even if the losses are gpu, probably fine since it's only a few of them pde_losses = [pde_loss_function(θ) for pde_loss_function in pde_loss_functions] @@ -612,42 +613,67 @@ function SciMLBase.symbolic_discretize(pde_system::PDESystem, return full_weighted_loss end - return full_loss_function + return bc_loss_functions, pde_loss_functions, full_loss_function end function get_likelihood_estimate_function(discretization::BayesianPINN) + # Because separate reweighting code section needed and loglikelihood is pointwise independent + pde_loss_functions, bc_loss_functions = merge_strategy_with_loglikelihood_function( + pinnrep, + strategy, + datafree_pde_loss_functions, + datafree_bc_loss_functions) + + # setup for all adaptive losses + num_pde_losses = length(pde_loss_functions) + num_bc_losses = length(bc_loss_functions) + # assume one single additional loss function if there is one. this means that the user needs to lump all their functions into a single one, + num_additional_loss = additional_loss isa Nothing ? 
0 : 1 + + adaloss_T = eltype(adaloss.pde_loss_weights) + + # this will error if the user has provided a number of initial weights that is more than 1 and doesn't match the number of loss functions + adaloss.pde_loss_weights = ones(adaloss_T, num_pde_losses) .* + adaloss.pde_loss_weights + adaloss.bc_loss_weights = ones(adaloss_T, num_bc_losses) .* adaloss.bc_loss_weights + adaloss.additional_loss_weights = ones(adaloss_T, num_additional_loss) .* + adaloss.additional_loss_weights + + reweight_losses_func = generate_adaptive_loss_function(pinnrep, adaloss, + pde_loss_functions, + bc_loss_functions) + dataset_pde, dataset_bc = discretization.dataset + dataset_pde = dataset_pde isa Nothing ? dataset_pde : get_dataset_train_points(eqs, dataset_pde, pinnrep) + dataset_bc = dataset_bc isa Nothing ? dataset_bc : get_dataset_train_points(eqs, dataset_bc, pinnrep) # required as Physics loss also needed on the discrete dataset domain points # data points are discrete and so by default GridTraining loss applies - # passing placeholder dx with GridTraining, it uses data points irl - datapde_loss_functions, databc_loss_functions = if (!(dataset_bc isa Nothing) || - !(dataset_pde isa Nothing)) - merge_strategy_with_loglikelihood_function(pinnrep, + # passing placeholder dx with GridTraining, it uses dataset points irl + datapde_loss_functions, databc_loss_functions = merge_strategy_with_loglikelihood_function( + pinnrep, GridTraining(0.1), datafree_pde_loss_functions, - datafree_bc_loss_functions, train_sets_pde = dataset_pde, train_sets_bc = dataset_bc) - else - (nothing, nothing) - end + datafree_bc_loss_functions, + train_sets_pde = dataset_pde, + train_sets_bc = dataset_bc) function full_loss_function(θ, allstd::Vector{Vector{Float64}}) - stdpdes, stdbcs, stdextra = allstd + stdpdes, stdbcs, stdextra, stdpdesnew = allstd # the aggregation happens on cpu even if the losses are gpu, probably fine since it's only a few of them - pde_loglikelihoods = [logpdf(Normal(0, stdpdes[i]), pde_loss_function(θ)) - for (i, pde_loss_function) in enumerate(pde_loss_functions)] - bc_loglikelihoods = [logpdf(Normal(0, stdbcs[j]), bc_loss_function(θ)) - for (j, bc_loss_function) in enumerate(bc_loss_functions)] + pde_loglikelihoods = sum([pde_loss_function(θ, stdpdes[i]) + for (i, pde_loss_function) in enumerate(pde_loss_functions)]) + bc_loglikelihoods = sum([bc_loss_function(θ, stdbcs[j]) + for (j, bc_loss_function) in enumerate(bc_loss_functions)]) if !(datapde_loss_functions isa Nothing) - pde_loglikelihoods += [logpdf(Normal(0, stdpdes[j]), pde_loss_function(θ)) - for (j, pde_loss_function) in enumerate(datapde_loss_functions)] + pde_loglikelihoods += sum([datapde_loss_function(θ, stdpdes[i]) + for (i, datapde_loss_function) in enumerate(datapde_loss_functions)]) end - if !(databc_loss_functions isa Nothing) - bc_loglikelihoods += [logpdf(Normal(0, stdbcs[j]), bc_loss_function(θ)) - for (j, bc_loss_function) in enumerate(databc_loss_functions)] + bc_loglikelihoods += sum([databc_loss_function(θ, stdbcs[j]) + for (j, databc_loss_function) in enumerate(databc_loss_functions)]) end # this is kind of a hack, and means that whenever the outer function is evaluated the increment goes up, even if it's not being optimized @@ -693,14 +719,15 @@ function SciMLBase.symbolic_discretize(pde_system::PDESystem, return full_weighted_loglikelihood end - return full_loss_function + return bc_loss_functions, pde_loss_functions, full_loss_function end - full_loss_function = get_likelihood_estimate_function(discretization) + 
bc_loss_functions, pde_loss_functions, full_loss_function = get_likelihood_estimate_function(discretization) + pinnrep.loss_functions = PINNLossFunctions(bc_loss_functions, pde_loss_functions, - full_loss_function, additional_loss, - datafree_pde_loss_functions, - datafree_bc_loss_functions) + full_loss_function, additional_loss, + datafree_pde_loss_functions, + datafree_bc_loss_functions) return pinnrep end diff --git a/src/training_strategies.jl b/src/training_strategies.jl index 858e93a23..5accfbc91 100644 --- a/src/training_strategies.jl +++ b/src/training_strategies.jl @@ -14,35 +14,74 @@ struct GridTraining{T} <: AbstractTrainingStrategy dx::T end +# dataset must have depvar values for same values of indvars +function get_dataset_train_points(eqs, train_sets, pinnrep) + dict_depvar_input = pinnrep.dict_depvar_input + depvars = pinnrep.depvars + dict_depvars = pinnrep.dict_depvars + dict_indvars = pinnrep.dict_indvars + + symbols_input = [(i, dict_depvar_input[i]) for i in depvars] + # [(:u, [:t])] + eq_args = NeuralPDE.get_argument(eqs, dict_indvars, dict_depvars) + # equation wise indvar presence ~ [[:t]] + # in each equation atleast one depvars must be a function of all indvars(to cover heterogenous/not case) + + # train_sets follows order of depvars + # take dataset indvar values if for equations depvar's indvar matches input symbol indvar + points = [] + for eq_arg in eq_args + eq_points = [] + for i in eachindex(symbols_input) + if symbols_input[i][2] == eq_arg + push!(eq_points, train_sets[i][:, 2:end]') + # Terminate to avoid repetitive ind var points inclusion + break + end + end + # Concatenate points for this equation argument + push!(points, vcat(eq_points...)) + end + + return points +end + # include dataset points in pde_residual loglikelihood (BayesianPINN) function merge_strategy_with_loglikelihood_function(pinnrep::PINNRepresentation, strategy::GridTraining, datafree_pde_loss_function, datafree_bc_loss_function; train_sets_pde = nothing, train_sets_bc = nothing) @unpack domains, eqs, bcs, dict_indvars, dict_depvars, flat_init_params = pinnrep - + dx = strategy.dx eltypeθ = eltype(pinnrep.flat_init_params) + # physics loss merge_strategy_with_loglikelihood_function call case + if ((train_sets_bc isa Nothing)&&(train_sets_pde isa Nothing)) + train_sets_pde, train_sets_bc = generate_training_sets( + domains, dx, eqs, bcs, eltypeθ, + dict_indvars, dict_depvars) + end + # is vec as later each _set in pde_train_sets are columns as points transformed to vector of points (pde_train_sets must be rowwise) pde_loss_functions = if !(train_sets_pde isa Nothing) - pde_train_sets = [train_set[:, 2:end] for train_set in train_sets_pde] - pde_train_sets = adapt.( - parameterless_type(ComponentArrays.getdata(flat_init_params)), - pde_train_sets) - [get_loss_function(_loss, _set, eltypeθ, strategy) - for (_loss, _set) in zip(datafree_pde_loss_function, + # dataset and domain pde losses case + pde_train_sets = adapt.(parameterless_type(ComponentArrays.getdata(flat_init_params)), + train_sets_pde) + + [get_points_loss_functions(_loss, _set, eltypeθ, strategy) + for (_loss, _set) in zip(datafree_pde_loss_function, pde_train_sets)] else nothing end bc_loss_functions = if !(train_sets_bc isa Nothing) - bcs_train_sets = [train_set[:, 2:end] for train_set in train_sets_bc] - bcs_train_sets = adapt.( - parameterless_type(ComponentArrays.getdata(flat_init_params)), - bcs_train_sets) - [get_loss_function(_loss, _set, eltypeθ, strategy) - for (_loss, _set) in zip(datafree_bc_loss_function, 
bcs_train_sets)] + # dataset and domain bc losses case + bcs_train_sets = adapt.(parameterless_type(ComponentArrays.getdata(flat_init_params)), + train_sets_bc) + + [get_points_loss_functions(_loss, _set, eltypeθ, strategy) + for (_loss, _set) in zip(datafree_bc_loss_function, bcs_train_sets)] else nothing end @@ -50,6 +89,18 @@ function merge_strategy_with_loglikelihood_function(pinnrep::PINNRepresentation, pde_loss_functions, bc_loss_functions end +function get_points_loss_functions(loss_function, train_set, eltypeθ, strategy::GridTraining; + τ = nothing) + # loss_function length is number of all points loss is being evaluated upon + # train sets rows are for each indvar, cols are coordinates (row_1,row_2,..row_n) at which loss evaluated + function loss(θ, std) + logpdf( + MvNormal(loss_function(train_set, θ)[1, :], + LinearAlgebra.Diagonal(abs2.(std .* ones(size(train_set)[2])))), + zeros(size(train_set)[2])) + end +end + function merge_strategy_with_loss_function(pinnrep::PINNRepresentation, strategy::GridTraining, datafree_pde_loss_function, @@ -67,13 +118,13 @@ function merge_strategy_with_loss_function(pinnrep::PINNRepresentation, pde_train_sets) bcs_train_sets = adapt.(parameterless_type(ComponentArrays.getdata(flat_init_params)), bcs_train_sets) + pde_loss_functions = [get_loss_function(_loss, _set, eltypeθ, strategy) for (_loss, _set) in zip(datafree_pde_loss_function, pde_train_sets)] - bc_loss_functions = [get_loss_function(_loss, _set, eltypeθ, strategy) - for (_loss, _set) in zip(datafree_bc_loss_function, bcs_train_sets)] - + for (_loss, _set) in zip(datafree_bc_loss_function, + bcs_train_sets)] pde_loss_functions, bc_loss_functions end diff --git a/test/BPINN_PDE_tests.jl b/test/BPINN_PDE_tests.jl index 98cacb748..35d62bd35 100644 --- a/test/BPINN_PDE_tests.jl +++ b/test/BPINN_PDE_tests.jl @@ -8,7 +8,7 @@ using Flux Random.seed!(100) -@testset "Example 1: 2D Periodic System" begin +@testset "Example 1: 1D Periodic System" begin # Cos(pi*t) example @parameters t @variables u(..) 
@@ -16,7 +16,7 @@ Random.seed!(100) eqs = Dt(u(t)) - cos(2 * π * t) ~ 0 bcs = [u(0) ~ 0.0] domains = [t ∈ Interval(0.0, 2.0)] - chainl = Lux.Chain(Lux.Dense(1, 6, tanh), Lux.Dense(6, 1)) + chainl = Lux.Chain(Lux.Dense(1, 5, tanh), Lux.Dense(5, 5, tanh), Lux.Dense(5, 1)) initl, st = Lux.setup(Random.default_rng(), chainl) @named pde_system = PDESystem(eqs, bcs, domains, [t], [u(t)]) @@ -25,8 +25,8 @@ Random.seed!(100) sol1 = ahmc_bayesian_pinn_pde(pde_system, discretization; - draw_samples = 1500, - bcstd = [0.02], + draw_samples = 250, + bcstd = [0.001], phystd = [0.01], priorsNNw = (0.0, 1.0), saveats = [1 / 50.0]) @@ -35,8 +35,9 @@ Random.seed!(100) ts = vec(sol1.timepoints[1]) u_real = [analytic_sol_func(0.0, t) for t in ts] u_predict = pmean(sol1.ensemblesol[1]) - @test u_predict≈u_real atol=0.5 - @test mean(u_predict .- u_real) < 0.1 + + @test u_predict≈u_real atol=0.02 + @test mean(abs.(u_predict .- u_real)) < 1e-3 end @testset "Example 2: 1D ODE" begin @@ -73,7 +74,7 @@ end ts = sol1.timepoints[1] u_real = vec([analytic_sol_func(t) for t in ts]) u_predict = pmean(sol1.ensemblesol[1]) - @test u_predict≈u_real atol=0.8 + @test u_predict≈u_real atol=0.5 end @testset "Example 3: 3rd Degree ODE" begin @@ -158,10 +159,10 @@ end sol1 = ahmc_bayesian_pinn_pde(pde_system, discretization; - draw_samples = 200, - bcstd = [0.003, 0.003, 0.003, 0.003], - phystd = [0.003], - priorsNNw = (0.0, 10.0), + draw_samples = 400, + bcstd = [0.05, 0.05, 0.05, 0.05], + phystd = [0.05], + priorsNNw = (0.0, 1.0), saveats = [1 / 100.0, 1 / 100.0]) xs = sol1.timepoints[1] @@ -169,7 +170,9 @@ end u_predict = pmean(sol1.ensemblesol[1]) u_real = [analytic_sol_func(xs[:, i][1], xs[:, i][2]) for i in 1:length(xs[1, :])] - @test u_predict≈u_real atol=1.5 + + @test sum(abs2.(u_predict .- u_real)) < 0.1 + @test u_predict≈u_real atol=0.1 end @testset "Translating from Flux" begin @@ -207,5 +210,5 @@ end ts = sol1.timepoints[1] u_real = vec([analytic_sol_func(t) for t in ts]) u_predict = pmean(sol1.ensemblesol[1]) - @test u_predict≈u_real atol=0.8 + @test u_predict≈u_real atol=0.5 end diff --git a/test/BPINN_PDEinvsol_tests.jl b/test/BPINN_PDEinvsol_tests.jl index c8fe60cb0..5cc53df35 100644 --- a/test/BPINN_PDEinvsol_tests.jl +++ b/test/BPINN_PDEinvsol_tests.jl @@ -3,7 +3,7 @@ import ModelingToolkit: Interval, infimum, supremum using ForwardDiff, Distributions, OrdinaryDiffEq using AdvancedHMC, Statistics, Random, Functors using NeuralPDE, MonteCarloMeasurements -using ComponentArrays +using ComponentArrays, ModelingToolkit Random.seed!(100) @@ -30,36 +30,11 @@ Random.seed!(100) analytic_sol_func1(u0, t) = u0 + sin(2 * π * t) / (2 * π) timepoints = collect(0.0:(1 / 100.0):2.0) - u = [analytic_sol_func1(0.0, timepoint) for timepoint in timepoints] - u = u .+ (u .* 0.2) .* randn(size(u)) - dataset = [hcat(u, timepoints)] + u1 = [analytic_sol_func1(0.0, timepoint) for timepoint in timepoints] + u1 = u1 .+ (u1 .* 0.2) .* randn(size(u1)) + dataset = [hcat(u1, timepoints)] - # checking all training strategies - discretization = BayesianPINN([chainl], StochasticTraining(200), param_estim = true, - dataset = [dataset, nothing]) - - ahmc_bayesian_pinn_pde(pde_system, - discretization; - draw_samples = 1500, - bcstd = [0.05], - phystd = [0.01], l2std = [0.01], - priorsNNw = (0.0, 1.0), - saveats = [1 / 50.0], - param = [LogNormal(6.0, 0.5)]) - - discretization = BayesianPINN([chainl], QuasiRandomTraining(200), param_estim = true, - dataset = [dataset, nothing]) - - ahmc_bayesian_pinn_pde(pde_system, - discretization; - 
draw_samples = 1500, - bcstd = [0.05], - phystd = [0.01], l2std = [0.01], - priorsNNw = (0.0, 1.0), - saveats = [1 / 50.0], - param = [LogNormal(6.0, 0.5)]) - - # alternative to QuadratureTraining [WIP] + # TODO: correct BPINN implementations for Training Strategies. discretization = BayesianPINN([chainl], GridTraining([0.02]), param_estim = true, dataset = [dataset, nothing]) @@ -78,9 +53,9 @@ Random.seed!(100) u_real = [analytic_sol_func1(0.0, t) for t in ts] u_predict = pmean(sol1.ensemblesol[1]) - @test u_predict≈u_real atol=1.5 - @test mean(u_predict .- u_real) < 0.1 - @test sol1.estimated_de_params[1]≈param atol=param * 0.3 + @test u_predict≈u_real atol=0.1 + @test mean(u_predict .- u_real) < 0.01 + @test sol1.estimated_de_params[1]≈param atol=0.1 end @testset "Example 2: Lorenz System with parameter estimation" begin @@ -143,3 +118,171 @@ end @test sum(abs, pmean(p_) - 10.00) < 0.3 * idealp[1] # @test sum(abs, pmean(p_[2]) - (8 / 3)) < 0.3 * idealp[2] end + +function recur_expression(exp, Dict_differentials) + for in_exp in exp.args + if !(in_exp isa Expr) + # skip +,== symbols, characters etc + continue + + elseif in_exp.args[1] isa ModelingToolkit.Differential + # first symbol of differential term + # Dict_differentials for masking differential terms + # and resubstituting differentials in equations after putting in interpolations + # temp = in_exp.args[end] + Dict_differentials[eval(in_exp)] = Symbolics.variable("diff_$(length(Dict_differentials) + 1)") + return + else + recur_expression(in_exp, Dict_differentials) + end + end +end + +@testset "improvement in Solving Parametric Kuromo-Sivashinsky Equation" begin + @parameters x, t, α + @variables u(..) + Dt = Differential(t) + Dx = Differential(x) + Dx2 = Differential(x)^2 + Dx3 = Differential(x)^3 + Dx4 = Differential(x)^4 + + # α = 1 (KS equation to be parametric in a) + β = 4 + γ = 1 + eq = Dt(u(x, t)) + u(x, t) * Dx(u(x, t)) + α * Dx2(u(x, t)) + β * Dx3(u(x, t)) + γ * Dx4(u(x, t)) ~ 0 + + u_analytic(x, t; z = -x / 2 + t) = 11 + 15 * tanh(z) - 15 * tanh(z)^2 - 15 * tanh(z)^3 + du(x, t; z = -x / 2 + t) = 15 / 2 * (tanh(z) + 1) * (3 * tanh(z) - 1) * sech(z)^2 + + bcs = [u(x, 0) ~ u_analytic(x, 0), + u(-10, t) ~ u_analytic(-10, t), + u(10, t) ~ u_analytic(10, t), + Dx(u(-10, t)) ~ du(-10, t), + Dx(u(10, t)) ~ du(10, t)] + + # Space and time domains + domains = [x ∈ Interval(-10.0, 10.0), + t ∈ Interval(0.0, 1.0)] + + # Discretization + dx = 0.4 + dt = 0.2 + + # Function to compute analytical solution at a specific point (x, t) + function u_analytic_point(x, t) + z = -x / 2 + t + return 11 + 15 * tanh(z) - 15 * tanh(z)^2 - 15 * tanh(z)^3 + end + + # Function to generate the dataset matrix + function generate_dataset_matrix(domains, dx, dt, xlim, tlim) + x_values = xlim[1]:dx:xlim[2] + t_values = tlim[1]:dt:tlim[2] + + dataset = [] + + for t in t_values + for x in x_values + u_value = u_analytic_point(x, t) + push!(dataset, [u_value, x, t]) + end + end + + return vcat([data' for data in dataset]...) 
+ end + + # considering sparse dataset from half of x's domain + datasetpde_new = [generate_dataset_matrix(domains, dx, dt, [-10, 0], [0.0, 1.0])] + + # Adding Gaussian noise with a 0.8 std + noisydataset_new = deepcopy(datasetpde_new) + noisydataset_new[1][:, 1] = noisydataset_new[1][:, 1] .+ + (randn(size(noisydataset_new[1][:, 1])) .* 0.8) + + # Neural network + chain = Lux.Chain(Lux.Dense(2, 8, Lux.tanh), + Lux.Dense(8, 8, Lux.tanh), + Lux.Dense(8, 1)) + + # Discretization for old and new models + discretization = NeuralPDE.BayesianPINN([chain], + GridTraining([dx, dt]), param_estim = true, dataset = [noisydataset_new, nothing]) + + # let α default to 2.0 + @named pde_system = PDESystem(eq, + bcs, + domains, + [x, t], + [u(x, t)], + [α], + defaults = Dict([α => 2.0])) + + # neccesarry for loss function construction (involves Operator masking) + eqs = pde_system.eqs + Dict_differentials = Dict() + exps = toexpr.(eqs) + nullobj = [recur_expression(exp, Dict_differentials) for exp in exps] + + # Dict_differentials is now ; + # Dict{Any, Any} with 5 entries: + # Differential(x)(Differential(x)(u(x, t))) => diff_5 + # Differential(x)(Differential(x)(Differential(x)(u(x… => diff_1 + # Differential(x)(Differential(x)(Differential(x)(Dif… => diff_2 + # Differential(x)(u(x, t)) => diff_4 + # Differential(t)(u(x, t)) => diff_3 + + # using HMC algorithm due to convergence, stability, time of training. (refer to mcmc chain plots) + # choice of std for objectives is very important + # pass in Dict_differentials, phystdnew arguments when using the new model + + sol_new = ahmc_bayesian_pinn_pde(pde_system, + discretization; + draw_samples = 150, + bcstd = [0.1, 0.1, 0.1, 0.1, 0.1], phystdnew = [0.2], + phystd = [0.2], l2std = [0.5], param = [Distributions.Normal(2.0, 2)], + priorsNNw = (0.0, 1.0), + saveats = [1 / 100.0, 1 / 100.0], + Dict_differentials = Dict_differentials) + + sol_old = ahmc_bayesian_pinn_pde(pde_system, + discretization; + draw_samples = 150, + bcstd = [0.1, 0.1, 0.1, 0.1, 0.1], + phystd = [0.2], l2std = [0.5], param = [Distributions.Normal(2.0, 2)], + priorsNNw = (0.0, 1.0), + saveats = [1 / 100.0, 1 / 100.0]) + + phi = discretization.phi[1] + xs, ts = [infimum(d.domain):dx:supremum(d.domain) + for (d, dx) in zip(domains, [dx / 10, dt])] + u_real = [[u_analytic(x, t) for x in xs] for t in ts] + + u_predict_new = [[first(pmean(phi([x, t], sol_new.estimated_nn_params[1]))) for x in xs] + for t in ts] + + diff_u_new = [[abs(u_analytic(x, t) - + first(pmean(phi([x, t], sol_new.estimated_nn_params[1])))) + for x in xs] + for t in ts] + + u_predict_old = [[first(pmean(phi([x, t], sol_old.estimated_nn_params[1]))) for x in xs] + for t in ts] + diff_u_old = [[abs(u_analytic(x, t) - + first(pmean(phi([x, t], sol_old.estimated_nn_params[1])))) + for x in xs] + for t in ts] + + @test all(all, [((diff_u_new[i]) .^ 2 .< 0.5) for i in 1:6]) == true + @test all(all, [((diff_u_old[i]) .^ 2 .< 0.5) for i in 1:6]) == false + + MSE_new = [sum(abs2, diff_u_new[i]) for i in 1:6] + MSE_old = [sum(abs2, diff_u_old[i]) for i in 1:6] + @test (MSE_new .< MSE_old) == [1, 1, 1, 1, 1, 1] + + param_new = sol_new.estimated_de_params[1] + param_old = sol_old.estimated_de_params[1] + α = 1 + @test abs(param_new - α) < 0.2 * α + @test abs(param_new - α) < abs(param_old - α) +end \ No newline at end of file diff --git a/test/BPINN_Tests.jl b/test/BPINN_Tests.jl index 6534e8840..88e794df8 100644 --- a/test/BPINN_Tests.jl +++ b/test/BPINN_Tests.jl @@ -145,7 +145,7 @@ end dataset = [x̂, time] physsol1 = 
[linear_analytic(prob.u0, p, time[i]) for i in eachindex(time)] - # seperate set of points for testing the solve() call (it uses saveat 1/50 hence here length 501) + # separate set of points for testing the solve() call (it uses saveat 1/50 hence here length 501) time1 = vec(collect(Float64, range(tspan[1], tspan[2], length = 501))) physsol2 = [linear_analytic(prob.u0, p, time1[i]) for i in eachindex(time1)] @@ -264,7 +264,7 @@ end dataset = [x̂, time] physsol1 = [linear_analytic(prob.u0, p, time[i]) for i in eachindex(time)] - # seperate set of points for testing the solve() call (it uses saveat 1/50 hence here length 501) + # separate set of points for testing the solve() call (it uses saveat 1/50 hence here length 501) time1 = vec(collect(Float64, range(tspan[1], tspan[2], length = 501))) physsol2 = [linear_analytic(prob.u0, p, time1[i]) for i in eachindex(time1)]
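
Note on the dataset layout assumed throughout this patch (see the comments in L2LossData and get_dataset_train_points): each dependent variable contributes one matrix whose first column holds the depvar values and whose remaining columns hold the corresponding independent-variable coordinates, and the vector of matrices follows the declaration order of the @variables macro. A minimal sketch for the 1D periodic test problem; the analytic values and the noise level are illustrative assumptions, not prescribed settings.

    # one matrix per depvar, columns = [depvar_values indvar1 indvar2 ...]
    timepoints = collect(0.0:(1 / 100.0):2.0)
    u_vals = [sin(2 * π * t) / (2 * π) for t in timepoints]   # analytic u(t) for u0 = 0 (illustrative)
    u_vals = u_vals .+ 0.05 .* randn(length(u_vals))          # assumed noise level
    dataset_pde = [hcat(u_vals, timepoints)]                  # [u(t) t] for the single depvar u
    # handed to the discretizer as dataset = [dataset_pde, nothing] when there is no BC data, e.g.
    # discretization = BayesianPINN([chain], GridTraining([0.02]); dataset = [dataset_pde, nothing])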
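
The pointwise likelihood built in get_points_loss_functions treats the residual at each dataset coordinate as an independent zero-mean Gaussian with the supplied standard deviation; the MvNormal with a diagonal covariance is equivalent to summing per-point Normal log-densities. A small self-contained check of that equivalence (the residual values below are made up for illustration):

    using Distributions, LinearAlgebra
    residuals = [0.01, -0.02, 0.005]       # illustrative residuals at three dataset points
    std = 0.05
    ll_mv = logpdf(MvNormal(residuals, Diagonal(abs2.(std .* ones(length(residuals))))),
        zeros(length(residuals)))
    ll_sum = sum(logpdf.(Normal.(0.0, std), residuals))
    # ll_mv ≈ ll_sum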
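
Putting the new pieces together: the extra collocation likelihood is opt-in and only activates when both phystdnew and Dict_differentials are passed to ahmc_bayesian_pinn_pde. The sketch below mirrors the Kuramoto-Sivashinsky test added above and assumes pde_system and discretization have already been built as in that test; the sampler settings are copied from the test rather than being recommended defaults.

    # mask differential operators so dataset values can be substituted into the equations;
    # recur_expression is the helper defined above in test/BPINN_PDEinvsol_tests.jl
    Dict_differentials = Dict()
    for expr in toexpr.(pde_system.eqs)
        recur_expression(expr, Dict_differentials)
    end

    # passing Dict_differentials (plus a std vector phystdnew for the new term) switches on
    # L2_loss2; leaving Dict_differentials = nothing reproduces the previous behaviour
    sol_new = ahmc_bayesian_pinn_pde(pde_system, discretization;
        draw_samples = 150,
        bcstd = [0.1, 0.1, 0.1, 0.1, 0.1],
        phystd = [0.2], phystdnew = [0.2], l2std = [0.5],
        param = [Distributions.Normal(2.0, 2)],
        priorsNNw = (0.0, 1.0),
        saveats = [1 / 100.0, 1 / 100.0],
        Dict_differentials = Dict_differentials)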