[Bugfix:Plagiarism] Fix all versions bug (#72)

* Fix multiple versions bug * add config.json * fix course * Add placeholder file * Fix tests(?)
Submitty · Jan 29, 2022 · 1fe7878 · 1fe7878
1 parent a45457e
commit 1fe7878
Show file tree

Hide file tree

Showing 31 changed files with 1,705 additions and 40 deletions.
diff --git a/bin/process_all.sh b/bin/process_all.sh
@@ -85,9 +85,11 @@ mkdir -p "${BASEPATH}/users"
 
     ############################################################################
     # Run Lichen
-    ./tokenize_all.py    "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
-    ./hash_all.py        "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
-    ./compare_hashes.out "$tmp_location" || { rm -rf "$tmp_location"; echo "${KILL_ERROR_MESSAGE}"; exit 1; }
+    {  # We still want to unzip files if an error occurs when running Lichen here
+      ./tokenize_all.py    "$tmp_location" &&
+      ./hash_all.py        "$tmp_location" &&
+      ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}";
+    }
 
     ############################################################################
     # Zip the results back up and send them back to the course's lichen directory

diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
@@ -157,8 +157,8 @@ int main(int argc, char* argv[]) {
   std::unordered_set<hash> provided_code;
   // stores all hashes from other gradeables
   std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
-  // stores the highest match for every student, used later for generating overall_rankings.txt
-  std::unordered_map<user_id, std::pair<int, Score>> highest_matches;
+  // stores the matches for every student, used later for generating overall_rankings.txt
+  std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>> highest_matches;
   // keeps track of max matching hashes across all submissions, used for calculation of ranking score
   unsigned int max_hashes_matched = 0;
 
@@ -283,7 +283,7 @@ int main(int argc, char* argv[]) {
         }
       }
 
-      // if the hash doesn't match any of the provided code's hashes, try to find matched between other students
+      // if the hash doesn't match any of the provided code's hashes, try to find matches between other students
       if (!provided_match_found) {
         // look up that hash in the all_hashes table, loop over all other students that have the same hash
         std::unordered_map<std::string, std::vector<HashLocation>> occurences = all_hashes[hash_itr->first];
@@ -333,23 +333,6 @@ int main(int argc, char* argv[]) {
       continue;
     }
 
-    // Save this submissions highest percent match for later when we generate overall_rankings.txt
-    float percentMatch = (*submission_itr)->getPercentage();
-    unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
-    Score submission_score(totalMatchingHashes, percentMatch);
-    if (max_hashes_matched < totalMatchingHashes) {
-      max_hashes_matched = totalMatchingHashes;
-    }
-
-    std::unordered_map<user_id, std::pair<int, Score> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
-    std::pair<int, Score> new_pair = {(*submission_itr)->version(), submission_score};
-    if (highest_matches_itr == highest_matches.end()) {
-      highest_matches.insert({(*submission_itr)->student(), new_pair});
-    }
-    else if (submission_score > highest_matches_itr->second.second) {
-      highest_matches_itr->second = new_pair;
-    }
-
     // =========================================================================
     // Write matches.json file
 
@@ -563,6 +546,19 @@ int main(int argc, char* argv[]) {
       }
     }
 
+    // =========================================================================
+    // Save this submission's highest percent match for later when we generate overall_rankings.txt
+    float percentMatch = (*submission_itr)->getPercentage();
+    unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
+    Score submission_score(totalMatchingHashes, percentMatch);
+    if (max_hashes_matched < totalMatchingHashes) {
+      max_hashes_matched = totalMatchingHashes;
+    }
+
+    std::pair<version_number, Score> new_pair = {(*submission_itr)->version(), submission_score};
+    highest_matches[(*submission_itr)->student()].push_back(new_pair);
+    // =========================================================================
+
     std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);
 
     // create the directory and a file to write into
@@ -609,10 +605,18 @@ int main(int argc, char* argv[]) {
   // take the map of highest matches and convert it to a vector so we can sort it
   // by percent match and then save it to a file
   std::vector<StudentRanking> ranking;
-  for (std::unordered_map<user_id, std::pair<int, Score> >::iterator itr
+  for (std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>>::iterator itr
         = highest_matches.begin(); itr != highest_matches.end(); ++itr) {
-    ranking.push_back(StudentRanking(itr->first, itr->second.first, "", itr->second.second));
-    ranking[ranking.size()-1].score.calculateScore(max_hashes_matched);
+
+    std::pair<version_number, Score> best_score = itr->second.front();
+    best_score.second.calculateScore(max_hashes_matched);
+    for (unsigned int i=0; i < itr->second.size(); i++) {
+      itr->second[i].second.calculateScore(max_hashes_matched);
+      if (itr->second[i].second > best_score.second) {
+        best_score = itr->second[i];
+      }
+    }
+    ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second));
   }
 
   std::sort(ranking.begin(), ranking.end(), ranking_sorter);

diff --git a/compare_hashes/score.h b/compare_hashes/score.h
@@ -2,6 +2,7 @@
 #define SCORE_H
 
 #include <cassert>
+#include <string>
 
 typedef int location_in_submission;
 typedef unsigned int hash;

diff --git a/tests/data/test_lichen/multiple_versions/expected_output/config.json b/tests/data/test_lichen/multiple_versions/expected_output/config.json
@@ -0,0 +1,18 @@
+{
+    "semester": "f21",
+    "course": "plagiarism",
+    "gradeable": "multiple_versions",
+    "config_id": 1,
+    "version": "all_versions",
+    "regex": [
+        ""
+    ],
+    "regex_dirs": [
+        "submissions"
+    ],
+    "language": "plaintext",
+    "threshold": 10,
+    "hash_size": 4,
+    "other_gradeables": [],
+    "ignore_submissions": []
+}
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/logs/lichen_job_output.txt b/tests/data/test_lichen/multiple_versions/expected_output/logs/lichen_job_output.txt
@@ -0,0 +1,10 @@
+Beginning Lichen run: 2021-12-21 17:20:31
+CONCATENATE ALL...done in 0 seconds, 949 Bytes concatenated
+TOKENIZE ALL...done in 0 seconds
+HASH ALL...done in 0 seconds
+COMPARE HASHES...finished loading in 0 seconds
+hash walk: 33% complete
+hash walk: 66% complete
+hash walk: 100% complete
+finished walking in 0 seconds
+COMPARE HASHES done in 0 seconds
diff --git a/...s/data/test_lichen/multiple_versions/expected_output/other_gradeables/git_placeholder.txt b/...s/data/test_lichen/multiple_versions/expected_output/other_gradeables/git_placeholder.txt
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt
@@ -0,0 +1,2 @@
+aphacker              2    81.4%      35
+bitdiddle             1    81.4%      35
diff --git a/...ata/test_lichen/multiple_versions/expected_output/provided_code/files/git_placeholder.txt b/...ata/test_lichen/multiple_versions/expected_output/provided_code/files/git_placeholder.txt
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/hashes.txt
diff --git a/.../data/test_lichen/multiple_versions/expected_output/provided_code/submission.concatenated b/.../data/test_lichen/multiple_versions/expected_output/provided_code/submission.concatenated
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/tokens.json b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/tokens.json
@@ -0,0 +1 @@
+[]
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/hashes.txt
@@ -0,0 +1,62 @@
+ccbfc51b
+46d2e902
+0a1bc040
+6d14f9b3
+a5d513dd
+5e030a24
+c715d526
+fd3fa0fe
+b1917b6c
+ccbfc51b
+fcf8964c
+6afa4117
+25a42a47
+2ac066f5
+c6097572
+6011cbf5
+adefe73d
+36182b9f
+36d719a0
+fe129c06
+e44ef48d
+6bb90c04
+083a9efd
+93d49734
+0f905a05
+8bfb058d
+06410254
+61b171ee
+6c920afa
+05660ab4
+30a548ac
+b38f50f3
+2997d7c5
+297c601f
+e8ccd482
+ae6d442f
+4de258e3
+fae8aa98
+24ac3d5d
+fbdad65f
+fc98ba6b
+44bbaa49
+83df01b7
+964fade5
+2ea0ba40
+5494f32a
+e248b1d9
+528feb65
+27d1db1f
+c552988d
+cf65191e
+eff2064e
+0847585b
+c64da9e5
+7b3dc1c1
+045fe7d1
+50ac87da
+f5f088e7
+ecb2eef0
+7d75f52c
+8576ec09
+497a431b
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/matches.json b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/matches.json
@@ -0,0 +1,136 @@
+[
+    {
+        "end": 4,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 4,
+                        "start": 1
+                    },
+                    {
+                        "end": 13,
+                        "start": 10
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 1,
+        "type": "match"
+    },
+    {
+        "end": 6,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 6,
+                        "start": 2
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 2,
+        "type": "match"
+    },
+    {
+        "end": 12,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 12,
+                        "start": 8
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 8,
+        "type": "match"
+    },
+    {
+        "end": 13,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 4,
+                        "start": 1
+                    },
+                    {
+                        "end": 13,
+                        "start": 10
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 10,
+        "type": "match"
+    },
+    {
+        "end": 14,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 14,
+                        "start": 11
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 11,
+        "type": "match"
+    },
+    {
+        "end": 20,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 32,
+                        "start": 26
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 14,
+        "type": "match"
+    },
+    {
+        "end": 34,
+        "others": [
+            {
+                "matchingpositions": [
+                    {
+                        "end": 46,
+                        "start": 34
+                    }
+                ],
+                "source_gradeable": "f21__plagiarism__multiple_versions",
+                "username": "bitdiddle",
+                "version": 1
+            }
+        ],
+        "start": 22,
+        "type": "match"
+    }
+]
diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt
@@ -0,0 +1 @@
+bitdiddle        1    f21__plagiarism__multiple_versions   32.79%
diff --git a/...ta/test_lichen/multiple_versions/expected_output/users/aphacker/1/submission.concatenated b/...ta/test_lichen/multiple_versions/expected_output/users/aphacker/1/submission.concatenated
@@ -0,0 +1,2 @@
+==== submission_1.txt ====
+This file is meant to represent the first submission of three in a small test involving users with multiple submissions.  This submission is a little longer than the other two submissions because we want to test that the system still works when the highest matching version is the second version.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		bitdiddle 1 f21__plagiarism__multiple_versions 32.79%
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		==== submission_1.txt ====
		This file is meant to represent the first submission of three in a small test involving users with multiple submissions. This submission is a little longer than the other two submissions because we want to test that the system still works when the highest matching version is the second version.