Merge pull request #19 from Smit-tay/main

New function(s) to locate function definitions in R scripts.
dark-peak-analytics · Feb 4, 2024 · 8518799 · 8518799
2 parents 98fe70d + f51b4e4
commit 8518799
Show file tree

Hide file tree

Showing 7 changed files with 290 additions and 8 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ export(check_markov_trace)
 export(check_trans_prob_array)
 export(check_trans_prob_mat)
 export(extract_function_name)
+export(find_function_definitions)
 export(find_next_vector_element)
 export(find_previous_vector_element)
 export(get_active_functions)

diff --git a/R/cheers_checker.R b/R/cheers_checker.R
@@ -29,20 +29,27 @@ find_next_vector_element <- function(value, vector) {
 #' @description Find the previous element of the vector before a value
 #' @param value A value of numeric values
 #' @param vector A vector of numeric values
+#' @param LTE a boolean to determine collection on "less than" or "less than equal"
 #' @return The previous element of the vector before the value
 #' @export
 #' @examples
 #' \dontrun{
 #' find_previous_vector_element(value = 5, vector = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
 #' }
 #'
-find_previous_vector_element <- function(value, vector){
+find_previous_vector_element <- function(value, vector, LTE=FALSE){
+
    # Find the elements in the vector that are less than the specified value
-   less_than_value <- vector[vector < value]
+   if(LTE==TRUE) {
+      less_than_value <- vector[vector <= value]
+   } else {
+      less_than_value <- vector[vector < value]
+   }
 
-  # If there are no elements less than the specified value, return the value
+  # If there are no elements less than the specified value, 
+  # return the minimum value found in the vector
    if (length(less_than_value) == 0) {
-     return(value)
+     return (min(vector))
    }
 
   # Find the maximum value among the elements less than the specified value
@@ -90,12 +97,136 @@ extract_function_name <- function(string){
 
 }
 
-#assertHE::extract_function_name(string = "#' }
-#
-#create_Markov_trace <- function(transition_matrix_,")
 
+#' @title Helper function for find_function_definitions
+#'
+#' @description Do not try to call this - it is only a help 
+#' and is only to keep find_function_definitions clean
+#' These two functions could really be combined - a task for another day
+#' 
+#' @param v_func_lnums A vector of "FUNCTION" line numbers
+#' 
+#' @param v_assign_lnums A vector of "XXX_ASSIGN" line number
+#' 
+#' @param v_symbol_lnums a vector of "SYMBOL" line numbers
+#' 
+#' @param parsed_data the dataframe result of running utils::getParseData
+#' 
+#' @return Filteresd vector of function line numbers
+#'
+filter_func_lnums <- function(v_func_lnums, v_assign_lnums, v_symbol_lnums, parsed_data) {
+
+  # Create an empty vector to store filtered v_func_lnums
+  filtered_v_func_lnums <- c()
+
+  # Iterate over each v_func_lnum
+  for (func_lnum in v_func_lnums) {
+
+    # Find the line number of the nearest v_assign_lnum before the v_func_lnum
+    assign_lnum <- max(v_assign_lnums[v_assign_lnums <= func_lnum])
+
+    # Find the line number of the nearest v_symbol_lnum before the v_func_lnum
+    symbol_lnum <- max(v_symbol_lnums[v_symbol_lnums <= func_lnum])
+
+    # Check if both assign_lnum and symbol_lnum exist 
+    # and that the assign doesn't come before the symbol.
+    #
+    # Ignoring uninteresting parse data like "expr", "NUM_CONST", etc
+    # R parse data for a function definition can only exist in this order:
+    # SYMBOL, ASSIGN, FUNCTION
+    if ( is.na(assign_lnum) || is.na(symbol_lnum)  || assign_lnum < symbol_lnum) {
+      next
+    }
+
+    # if assign_lnum == symbol_lnum, 
+    # Then we might have a single line like this:
+    #
+    # foo <- function(A){}   # i.e. a function definition
+    #
+    # or we may have the use of a Lambda (anonymous) function like this:
+    #
+    # result <- apply(data, 2, function(x) { sqrt(mean(x^2)) })
+    #
+    # This is NOT a funtion definition (well, it is, but, not one we care about) !
+    # in this case, there will (hopefully) always be a SYMBOL between 
+    # FUNCTION and XXX_ASSIGN.
+    #
+    # check that the col1 value for assign is greater than for symbol
+    if(assign_lnum == symbol_lnum){
+
+      assign_col1 <-  max(  parsed_data[ parsed_data$line1 == assign_lnum 
+                                        & (parsed_data$token == "LEFT_ASSIGN" 
+                                            | parsed_data$token == "EQ_ASSIGN") , 
+                                        "col1"] )
+
+      symbol_col1 <-  max(  parsed_data[ parsed_data$line1 == symbol_lnum 
+                                         & parsed_data$token == "SYMBOL" ,
+                                        "col1"] )
+
+      if(symbol_col1 >= assign_col1){
+        # These aren't the droids you're looking for.
+        next
+      }
+    }
+
+    # If we get this far, we're pretty confident we have a function definition.
+    # Add the v_func_lnum to the filtered vector
+    filtered_v_func_lnums <- c(filtered_v_func_lnums, func_lnum)
+
+  }
+
+  # Return the filtered vector
+  return(filtered_v_func_lnums)
+}
 
+#' @title Parses an R source file, returns function names defined within.
+#'
+#' @description Using utils::getParseData(), searches for function definitions
+#' by matching the FUNCTION keyword (i.e. "function") 
+#' with it's associated SYMBOL (i.e the function name)
+#'
+#' @param filename A string containing a path to an R source file
+#'
+#' @return A string containing the function names
+#'
+#' @importFrom stringr str_locate_all str_replace_all
+#'
+#' @export
+#'
+find_function_definitions <- function(filename) {
+
+  # Parse the R code
+  parsed_data <-
+    utils::getParseData(parse(filename, keep.source = TRUE))
+
+  # get the line numbers containing the function keyword 
+  # identified by token value "FUNCTION"
+  v_func_lnums <-  parsed_data[parsed_data$token == "FUNCTION", "line1"]
+
+# find line numbers which are identified as assignments 
+# named functions get assigned names - Lambdas (anonymous funcitons) don't !
+  v_assign_lnums <- parsed_data[parsed_data$token == "LEFT_ASSIGN" | parsed_data$token == "EQ_ASSIGN", "line1"]
+
+  # find line numbers which are identified as symbol definitions
+  # Function names are identiffied as SYMBOLs
+  v_symbol_lnums <- parsed_data[parsed_data$token == "SYMBOL", "line1"]
+
+  # Try to filter the use of the keyword "function" when it is used
+  # as a Lambda (anonymous) function.
+  v_func_lnums <- filter_func_lnums(v_func_lnums, v_assign_lnums, v_symbol_lnums, parsed_data)
+
+  # for each function location find the immediately preceeding symbols location
+  v_symbols_preceding_functions <- sapply(X = v_func_lnums,
+                                          FUN = find_previous_vector_element, 
+                                          vector=v_symbol_lnums, LTE=TRUE)
+
+  # Extract the symbol names (stored in the "text" element)
+  # from the data.frame located at the line numbers that identify symbols
+  function_symbols <- parsed_data[  parsed_data$line1 %in% v_symbols_preceding_functions 
+                                  & parsed_data$token == "SYMBOL", "text"]
+  return(function_symbols)
 
+}
 
 #' @title Get cheers classification tags from a given file
 #' @description For a provided filepath, identify the cheers classification tags

diff --git a/man/filter_func_lnums.Rd b/man/filter_func_lnums.Rd
diff --git a/man/find_function_definitions.Rd b/man/find_function_definitions.Rd
diff --git a/man/find_previous_vector_element.Rd b/man/find_previous_vector_element.Rd
diff --git a/tests/testthat/example_scripts/example_tricky_functions.R b/tests/testthat/example_scripts/example_tricky_functions.R
@@ -0,0 +1,84 @@
+
+# Messy function 1: Unclear variable names, inconsistent formatting
+do_something_random <- function(x, y) {
+  result <- x + y * 2  # No spaces around operators
+  return(result)  # Unnecessary return statement
+}
+
+# Messy function 2: Redundant parentheses, unnecessary comments
+calculate_something <- function(data) {
+  filtered_data <- data[data$value > 10, ]  # Redundant parentheses
+  # Calculate the mean (average)
+  mean_value <- mean(filtered_data$value)  # Unnecessary comment
+  return(mean_value)
+}
+
+# Messy function 3: Nested loops, inconsistent indentation
+find_matches <- function(list1, list2) {
+  matches <- vector()
+  for (item1 in list1) {
+    for (item2 in list2) {
+      if (item1 == item2) {
+        matches <- append(matches, item1)  # Inefficient use of append
+      }
+    }
+  }
+  return(matches)
+}
+
+# Messy function 4: Global variables, side effects
+global_var <- 0
+perform_task <- function(input) {
+  global_var <<- global_var + input  # Modifying global variable
+  return(global_var)
+}
+
+# Messy function 5: Long lines, no comments
+combine_strings <- function(str1, str2, sep = " ") {
+  combined_string <- paste(str1, str2, sep = sep)  # Long line, no comment
+  return(combined_string)
+}
+
+# Messy function 6: Unclear logic, repetitive code
+process_data <- function(data) {
+  if (some_condition) {
+    # Do something
+  } else if (another_condition) {
+    # Do something else
+  } else {
+    # Do something different
+  }
+  # More repetitive code
+}
+
+# Messy function 7: Deeply nested function calls, hard to read
+transform_data = function(data) {
+  result <- apply(data, 2,  #comment
+  # comment
+    function(x) {
+    sqrt(mean(x^2))  # Nested function call, multiple operations
+  })
+  return(result)
+}
+
+# Messy function 8: Unnecessary temporary variables, inefficient sorting
+sort_values <- function(values) {
+  temp <- sort(values)  # Unnecessary temporary variable
+  sorted_values <- temp  # Inefficient sorting
+  return(sorted_values)
+}
+
+# Messy function 9: Magic numbers, unclear purpose
+generate_output <- function() {
+  output <- list(value1 = 42, value2 = 3.14)  # Magic numbers
+  return(output)
+}
+
+# Messy function 10: Overly complex, hard to maintain
+do_everything <- function(data) {
+  # Combine multiple steps, hard to follow
+  result <- process_data(data)
+  result <- transform_data(result)
+  output <- generate_output(result)
+  return(output)
+}
diff --git a/tests/testthat/test-cheers_checker.R b/tests/testthat/test-cheers_checker.R
@@ -228,3 +228,23 @@ test_that("get_folder_cheers_classifications works for an example project",
                                                      cheers_pattern =  "@family") |> nrow()
             expect_true(object = tmp > 0)
           })
+
+test_that("find_function_definitions works as intended",
+    {
+      expect_equal(
+        object = find_function_definitions(
+                    filename = testthat::test_path("example_scripts", "example_tricky_functions.R")),
+        expected = c( 
+            "do_something_random"
+          , "calculate_something"
+          , "find_matches"
+          , "perform_task"
+          , "combine_strings"
+          , "process_data"
+          , "transform_data"
+          , "sort_values"
+          , "generate_output"
+          , "do_everything")
+      )
+    })
+