diff --git a/bin/process_all.sh b/bin/process_all.sh index 34fbb48..84a23c4 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -85,9 +85,11 @@ mkdir -p "${BASEPATH}/users" ############################################################################ # Run Lichen - ./tokenize_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; } - ./hash_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; } - ./compare_hashes.out "$tmp_location" || { rm -rf "$tmp_location"; echo "${KILL_ERROR_MESSAGE}"; exit 1; } + { # We still want to unzip files if an error occurs when running Lichen here + ./tokenize_all.py "$tmp_location" && + ./hash_all.py "$tmp_location" && + ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}"; + } ############################################################################ # Zip the results back up and send them back to the course's lichen directory diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index 91eeab5..ed7d4d0 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -157,8 +157,8 @@ int main(int argc, char* argv[]) { std::unordered_set provided_code; // stores all hashes from other gradeables std::unordered_map>> other_gradeables; - // stores the highest match for every student, used later for generating overall_rankings.txt - std::unordered_map> highest_matches; + // stores the matches for every student, used later for generating overall_rankings.txt + std::unordered_map>> highest_matches; // keeps track of max matching hashes across all submissions, used for calculation of ranking score unsigned int max_hashes_matched = 0; @@ -283,7 +283,7 @@ int main(int argc, char* argv[]) { } } - // if the hash doesn't match any of the provided code's hashes, try to find matched between other students + // if the hash doesn't match any of the provided code's hashes, try to find matches between other students if (!provided_match_found) { // look up that hash in the all_hashes table, loop over all other students that have the same hash std::unordered_map> occurences = all_hashes[hash_itr->first]; @@ -333,23 +333,6 @@ int main(int argc, char* argv[]) { continue; } - // Save this submissions highest percent match for later when we generate overall_rankings.txt - float percentMatch = (*submission_itr)->getPercentage(); - unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount(); - Score submission_score(totalMatchingHashes, percentMatch); - if (max_hashes_matched < totalMatchingHashes) { - max_hashes_matched = totalMatchingHashes; - } - - std::unordered_map >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student()); - std::pair new_pair = {(*submission_itr)->version(), submission_score}; - if (highest_matches_itr == highest_matches.end()) { - highest_matches.insert({(*submission_itr)->student(), new_pair}); - } - else if (submission_score > highest_matches_itr->second.second) { - highest_matches_itr->second = new_pair; - } - // ========================================================================= // Write matches.json file @@ -563,6 +546,19 @@ int main(int argc, char* argv[]) { } } + // ========================================================================= + // Save this submission's highest percent match for later when we generate overall_rankings.txt + float percentMatch = (*submission_itr)->getPercentage(); + unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount(); + Score submission_score(totalMatchingHashes, percentMatch); + if (max_hashes_matched < totalMatchingHashes) { + max_hashes_matched = totalMatchingHashes; + } + + std::pair new_pair = {(*submission_itr)->version(), submission_score}; + highest_matches[(*submission_itr)->student()].push_back(new_pair); + // ========================================================================= + std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter); // create the directory and a file to write into @@ -609,10 +605,18 @@ int main(int argc, char* argv[]) { // take the map of highest matches and convert it to a vector so we can sort it // by percent match and then save it to a file std::vector ranking; - for (std::unordered_map >::iterator itr + for (std::unordered_map>>::iterator itr = highest_matches.begin(); itr != highest_matches.end(); ++itr) { - ranking.push_back(StudentRanking(itr->first, itr->second.first, "", itr->second.second)); - ranking[ranking.size()-1].score.calculateScore(max_hashes_matched); + + std::pair best_score = itr->second.front(); + best_score.second.calculateScore(max_hashes_matched); + for (unsigned int i=0; i < itr->second.size(); i++) { + itr->second[i].second.calculateScore(max_hashes_matched); + if (itr->second[i].second > best_score.second) { + best_score = itr->second[i]; + } + } + ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second)); } std::sort(ranking.begin(), ranking.end(), ranking_sorter); diff --git a/compare_hashes/score.h b/compare_hashes/score.h index cc44450..af085ed 100644 --- a/compare_hashes/score.h +++ b/compare_hashes/score.h @@ -2,6 +2,7 @@ #define SCORE_H #include +#include typedef int location_in_submission; typedef unsigned int hash; diff --git a/tests/data/test_lichen/multiple_versions/expected_output/config.json b/tests/data/test_lichen/multiple_versions/expected_output/config.json new file mode 100644 index 0000000..9687866 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/config.json @@ -0,0 +1,18 @@ +{ + "semester": "f21", + "course": "plagiarism", + "gradeable": "multiple_versions", + "config_id": 1, + "version": "all_versions", + "regex": [ + "" + ], + "regex_dirs": [ + "submissions" + ], + "language": "plaintext", + "threshold": 10, + "hash_size": 4, + "other_gradeables": [], + "ignore_submissions": [] +} \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/expected_output/logs/lichen_job_output.txt b/tests/data/test_lichen/multiple_versions/expected_output/logs/lichen_job_output.txt new file mode 100644 index 0000000..363466d --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/logs/lichen_job_output.txt @@ -0,0 +1,10 @@ +Beginning Lichen run: 2021-12-21 17:20:31 +CONCATENATE ALL...done in 0 seconds, 949 Bytes concatenated +TOKENIZE ALL...done in 0 seconds +HASH ALL...done in 0 seconds +COMPARE HASHES...finished loading in 0 seconds +hash walk: 33% complete +hash walk: 66% complete +hash walk: 100% complete +finished walking in 0 seconds +COMPARE HASHES done in 0 seconds diff --git a/tests/data/test_lichen/multiple_versions/expected_output/other_gradeables/git_placeholder.txt b/tests/data/test_lichen/multiple_versions/expected_output/other_gradeables/git_placeholder.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt new file mode 100644 index 0000000..76bf21e --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt @@ -0,0 +1,2 @@ +aphacker 2 81.4% 35 +bitdiddle 1 81.4% 35 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/files/git_placeholder.txt b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/files/git_placeholder.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/hashes.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/submission.concatenated b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/submission.concatenated new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/provided_code/tokens.json b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/tokens.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/provided_code/tokens.json @@ -0,0 +1 @@ +[] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/hashes.txt new file mode 100644 index 0000000..750fd88 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/hashes.txt @@ -0,0 +1,62 @@ +ccbfc51b +46d2e902 +0a1bc040 +6d14f9b3 +a5d513dd +5e030a24 +c715d526 +fd3fa0fe +b1917b6c +ccbfc51b +fcf8964c +6afa4117 +25a42a47 +2ac066f5 +c6097572 +6011cbf5 +adefe73d +36182b9f +36d719a0 +fe129c06 +e44ef48d +6bb90c04 +083a9efd +93d49734 +0f905a05 +8bfb058d +06410254 +61b171ee +6c920afa +05660ab4 +30a548ac +b38f50f3 +2997d7c5 +297c601f +e8ccd482 +ae6d442f +4de258e3 +fae8aa98 +24ac3d5d +fbdad65f +fc98ba6b +44bbaa49 +83df01b7 +964fade5 +2ea0ba40 +5494f32a +e248b1d9 +528feb65 +27d1db1f +c552988d +cf65191e +eff2064e +0847585b +c64da9e5 +7b3dc1c1 +045fe7d1 +50ac87da +f5f088e7 +ecb2eef0 +7d75f52c +8576ec09 +497a431b \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/matches.json b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/matches.json new file mode 100644 index 0000000..f88e5d3 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/matches.json @@ -0,0 +1,136 @@ +[ + { + "end": 4, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 1, + "type": "match" + }, + { + "end": 6, + "others": [ + { + "matchingpositions": [ + { + "end": 6, + "start": 2 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 2, + "type": "match" + }, + { + "end": 12, + "others": [ + { + "matchingpositions": [ + { + "end": 12, + "start": 8 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 8, + "type": "match" + }, + { + "end": 13, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 10, + "type": "match" + }, + { + "end": 14, + "others": [ + { + "matchingpositions": [ + { + "end": 14, + "start": 11 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 11, + "type": "match" + }, + { + "end": 20, + "others": [ + { + "matchingpositions": [ + { + "end": 32, + "start": 26 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 14, + "type": "match" + }, + { + "end": 34, + "others": [ + { + "matchingpositions": [ + { + "end": 46, + "start": 34 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 22, + "type": "match" + } +] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt new file mode 100644 index 0000000..f2807a3 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt @@ -0,0 +1 @@ +bitdiddle 1 f21__plagiarism__multiple_versions 32.79% diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/submission.concatenated b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/submission.concatenated new file mode 100644 index 0000000..dc777e0 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/submission.concatenated @@ -0,0 +1,2 @@ +==== submission_1.txt ==== +This file is meant to represent the first submission of three in a small test involving users with multiple submissions. This submission is a little longer than the other two submissions because we want to test that the system still works when the highest matching version is the second version. diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/tokens.json b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/tokens.json new file mode 100644 index 0000000..88b44e4 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/tokens.json @@ -0,0 +1,392 @@ +[ + { + "char": 1, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 2, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 3, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 4, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 6, + "line": 1, + "type": "string", + "value": "submission" + }, + { + "char": 16, + "line": 1, + "type": "punctuation", + "value": "_" + }, + { + "char": 17, + "line": 1, + "type": "number", + "value": 1 + }, + { + "char": 18, + "line": 1, + "type": "punctuation", + "value": "." + }, + { + "char": 19, + "line": 1, + "type": "string", + "value": "txt" + }, + { + "char": 23, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 24, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 25, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 26, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 1, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 2, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 14, + "line": 2, + "type": "string", + "value": "meant" + }, + { + "char": 20, + "line": 2, + "type": "string", + "value": "to" + }, + { + "char": 23, + "line": 2, + "type": "string", + "value": "represent" + }, + { + "char": 33, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 37, + "line": 2, + "type": "string", + "value": "first" + }, + { + "char": 43, + "line": 2, + "type": "string", + "value": "submission" + }, + { + "char": 54, + "line": 2, + "type": "string", + "value": "of" + }, + { + "char": 57, + "line": 2, + "type": "string", + "value": "three" + }, + { + "char": 63, + "line": 2, + "type": "string", + "value": "in" + }, + { + "char": 66, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 68, + "line": 2, + "type": "string", + "value": "small" + }, + { + "char": 74, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 79, + "line": 2, + "type": "string", + "value": "involving" + }, + { + "char": 89, + "line": 2, + "type": "string", + "value": "users" + }, + { + "char": 95, + "line": 2, + "type": "string", + "value": "with" + }, + { + "char": 100, + "line": 2, + "type": "string", + "value": "multiple" + }, + { + "char": 109, + "line": 2, + "type": "string", + "value": "submissions" + }, + { + "char": 120, + "line": 2, + "type": "punctuation", + "value": "." + }, + { + "char": 123, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 128, + "line": 2, + "type": "string", + "value": "submission" + }, + { + "char": 139, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 142, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 144, + "line": 2, + "type": "string", + "value": "little" + }, + { + "char": 151, + "line": 2, + "type": "string", + "value": "longer" + }, + { + "char": 158, + "line": 2, + "type": "string", + "value": "than" + }, + { + "char": 163, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 167, + "line": 2, + "type": "string", + "value": "other" + }, + { + "char": 173, + "line": 2, + "type": "string", + "value": "two" + }, + { + "char": 177, + "line": 2, + "type": "string", + "value": "submissions" + }, + { + "char": 189, + "line": 2, + "type": "string", + "value": "because" + }, + { + "char": 197, + "line": 2, + "type": "string", + "value": "we" + }, + { + "char": 200, + "line": 2, + "type": "string", + "value": "want" + }, + { + "char": 205, + "line": 2, + "type": "string", + "value": "to" + }, + { + "char": 208, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 213, + "line": 2, + "type": "string", + "value": "that" + }, + { + "char": 218, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 222, + "line": 2, + "type": "string", + "value": "system" + }, + { + "char": 229, + "line": 2, + "type": "string", + "value": "still" + }, + { + "char": 235, + "line": 2, + "type": "string", + "value": "works" + }, + { + "char": 241, + "line": 2, + "type": "string", + "value": "when" + }, + { + "char": 246, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 250, + "line": 2, + "type": "string", + "value": "highest" + }, + { + "char": 258, + "line": 2, + "type": "string", + "value": "matching" + }, + { + "char": 267, + "line": 2, + "type": "string", + "value": "version" + }, + { + "char": 275, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 278, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 282, + "line": 2, + "type": "string", + "value": "second" + }, + { + "char": 289, + "line": 2, + "type": "string", + "value": "version" + }, + { + "char": 296, + "line": 2, + "type": "punctuation", + "value": "." + } +] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/hashes.txt new file mode 100644 index 0000000..99d4526 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/hashes.txt @@ -0,0 +1,43 @@ +ccbfc51b +46d2e902 +0a1bc040 +b8cd789e +50fb6242 +2ddc3046 +9343e0c2 +fd3fa0fe +b1917b6c +ccbfc51b +fcf8964c +4c7b788b +3c78a77e +0480aa34 +76b55240 +d9fa30ce +e7e56e4a +122d719c +e5056ce3 +9b510320 +134f15c7 +09af10f8 +2ae680fc +12fbeaef +9e2a0ae6 +2ac066f5 +c6097572 +6011cbf5 +adefe73d +bdadb582 +88c09423 +916c300e +44f252c0 +6bb90c04 +083a9efd +93d49734 +0f905a05 +8bfb058d +06410254 +61b171ee +6c920afa +05660ab4 +30a548ac \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/matches.json b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/matches.json new file mode 100644 index 0000000..6f55541 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/matches.json @@ -0,0 +1,118 @@ +[ + { + "end": 4, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 1, + "type": "match" + }, + { + "end": 6, + "others": [ + { + "matchingpositions": [ + { + "end": 6, + "start": 2 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 2, + "type": "match" + }, + { + "end": 12, + "others": [ + { + "matchingpositions": [ + { + "end": 12, + "start": 8 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 8, + "type": "match" + }, + { + "end": 13, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 10, + "type": "match" + }, + { + "end": 32, + "others": [ + { + "matchingpositions": [ + { + "end": 32, + "start": 11 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 11, + "type": "match" + }, + { + "end": 46, + "others": [ + { + "matchingpositions": [ + { + "end": 46, + "start": 34 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "bitdiddle", + "version": 1 + } + ], + "start": 34, + "type": "match" + } +] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt new file mode 100644 index 0000000..ceaae6b --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt @@ -0,0 +1 @@ +bitdiddle 1 f21__plagiarism__multiple_versions 80.95% diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/submission.concatenated b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/submission.concatenated new file mode 100644 index 0000000..6ffb4eb --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/submission.concatenated @@ -0,0 +1,2 @@ +==== submission_2.txt ==== +This is a test file for the Lichen plagiarism detection system. This file is meant to represent the second submission of three in a small test involving users with multiple submissions. diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/tokens.json b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/tokens.json new file mode 100644 index 0000000..25e88bc --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/tokens.json @@ -0,0 +1,278 @@ +[ + { + "char": 1, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 2, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 3, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 4, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 6, + "line": 1, + "type": "string", + "value": "submission" + }, + { + "char": 16, + "line": 1, + "type": "punctuation", + "value": "_" + }, + { + "char": 17, + "line": 1, + "type": "number", + "value": 2 + }, + { + "char": 18, + "line": 1, + "type": "punctuation", + "value": "." + }, + { + "char": 19, + "line": 1, + "type": "string", + "value": "txt" + }, + { + "char": 23, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 24, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 25, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 26, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 1, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 9, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 11, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 16, + "line": 2, + "type": "string", + "value": "file" + }, + { + "char": 21, + "line": 2, + "type": "string", + "value": "for" + }, + { + "char": 25, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 29, + "line": 2, + "type": "string", + "value": "Lichen" + }, + { + "char": 36, + "line": 2, + "type": "string", + "value": "plagiarism" + }, + { + "char": 47, + "line": 2, + "type": "string", + "value": "detection" + }, + { + "char": 57, + "line": 2, + "type": "string", + "value": "system" + }, + { + "char": 63, + "line": 2, + "type": "punctuation", + "value": "." + }, + { + "char": 66, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 71, + "line": 2, + "type": "string", + "value": "file" + }, + { + "char": 76, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 79, + "line": 2, + "type": "string", + "value": "meant" + }, + { + "char": 85, + "line": 2, + "type": "string", + "value": "to" + }, + { + "char": 88, + "line": 2, + "type": "string", + "value": "represent" + }, + { + "char": 98, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 102, + "line": 2, + "type": "string", + "value": "second" + }, + { + "char": 109, + "line": 2, + "type": "string", + "value": "submission" + }, + { + "char": 120, + "line": 2, + "type": "string", + "value": "of" + }, + { + "char": 123, + "line": 2, + "type": "string", + "value": "three" + }, + { + "char": 129, + "line": 2, + "type": "string", + "value": "in" + }, + { + "char": 132, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 134, + "line": 2, + "type": "string", + "value": "small" + }, + { + "char": 140, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 145, + "line": 2, + "type": "string", + "value": "involving" + }, + { + "char": 155, + "line": 2, + "type": "string", + "value": "users" + }, + { + "char": 161, + "line": 2, + "type": "string", + "value": "with" + }, + { + "char": 166, + "line": 2, + "type": "string", + "value": "multiple" + }, + { + "char": 175, + "line": 2, + "type": "string", + "value": "submissions" + }, + { + "char": 186, + "line": 2, + "type": "punctuation", + "value": "." + } +] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/hashes.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/hashes.txt new file mode 100644 index 0000000..9d7b5c3 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/hashes.txt @@ -0,0 +1,43 @@ +ccbfc51b +46d2e902 +0a1bc040 +b37a3b73 +17f0039d +ca413f89 +5c7d0821 +fd3fa0fe +b1917b6c +ccbfc51b +fcf8964c +4c7b788b +3c78a77e +0480aa34 +76b55240 +d9fa30ce +e7e56e4a +122d719c +e5056ce3 +9b510320 +134f15c7 +09af10f8 +2ae680fc +12fbeaef +9e2a0ae6 +2ac066f5 +c6097572 +6011cbf5 +adefe73d +6d1d0ed1 +b718f1ae +bb067147 +7303ecef +6bb90c04 +083a9efd +93d49734 +0f905a05 +8bfb058d +06410254 +61b171ee +6c920afa +05660ab4 +30a548ac \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/matches.json b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/matches.json new file mode 100644 index 0000000..6ab9251 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/matches.json @@ -0,0 +1,239 @@ +[ + { + "end": 4, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 1, + "type": "match" + }, + { + "end": 6, + "others": [ + { + "matchingpositions": [ + { + "end": 6, + "start": 2 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 6, + "start": 2 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 2, + "type": "match" + }, + { + "end": 12, + "others": [ + { + "matchingpositions": [ + { + "end": 12, + "start": 8 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 12, + "start": 8 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 8, + "type": "match" + }, + { + "end": 13, + "others": [ + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 4, + "start": 1 + }, + { + "end": 13, + "start": 10 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 10, + "type": "match" + }, + { + "end": 14, + "others": [ + { + "matchingpositions": [ + { + "end": 14, + "start": 11 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 14, + "start": 11 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 11, + "type": "match" + }, + { + "end": 28, + "others": [ + { + "matchingpositions": [ + { + "end": 28, + "start": 12 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 12, + "type": "match" + }, + { + "end": 32, + "others": [ + { + "matchingpositions": [ + { + "end": 20, + "start": 14 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 32, + "start": 26 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 26, + "type": "match" + }, + { + "end": 46, + "others": [ + { + "matchingpositions": [ + { + "end": 34, + "start": 22 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 1 + }, + { + "matchingpositions": [ + { + "end": 46, + "start": 34 + } + ], + "source_gradeable": "f21__plagiarism__multiple_versions", + "username": "aphacker", + "version": 2 + } + ], + "start": 34, + "type": "match" + } +] diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt new file mode 100644 index 0000000..8f2a405 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt @@ -0,0 +1,2 @@ +aphacker 2 f21__plagiarism__multiple_versions 80.95% +aphacker 1 f21__plagiarism__multiple_versions 47.62% diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/submission.concatenated b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/submission.concatenated new file mode 100644 index 0000000..0e371d1 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/submission.concatenated @@ -0,0 +1,2 @@ +==== submission_3.txt ==== +This is a test file for the Lichen plagiarism detection system. This file is meant to represent the third submission of three in a small test involving users with multiple submissions. diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/tokens.json b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/tokens.json new file mode 100644 index 0000000..2a2989e --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/tokens.json @@ -0,0 +1,278 @@ +[ + { + "char": 1, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 2, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 3, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 4, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 6, + "line": 1, + "type": "string", + "value": "submission" + }, + { + "char": 16, + "line": 1, + "type": "punctuation", + "value": "_" + }, + { + "char": 17, + "line": 1, + "type": "number", + "value": 3 + }, + { + "char": 18, + "line": 1, + "type": "punctuation", + "value": "." + }, + { + "char": 19, + "line": 1, + "type": "string", + "value": "txt" + }, + { + "char": 23, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 24, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 25, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 26, + "line": 1, + "type": "punctuation", + "value": "=" + }, + { + "char": 1, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 9, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 11, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 16, + "line": 2, + "type": "string", + "value": "file" + }, + { + "char": 21, + "line": 2, + "type": "string", + "value": "for" + }, + { + "char": 25, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 29, + "line": 2, + "type": "string", + "value": "Lichen" + }, + { + "char": 36, + "line": 2, + "type": "string", + "value": "plagiarism" + }, + { + "char": 47, + "line": 2, + "type": "string", + "value": "detection" + }, + { + "char": 57, + "line": 2, + "type": "string", + "value": "system" + }, + { + "char": 63, + "line": 2, + "type": "punctuation", + "value": "." + }, + { + "char": 66, + "line": 2, + "type": "string", + "value": "This" + }, + { + "char": 71, + "line": 2, + "type": "string", + "value": "file" + }, + { + "char": 76, + "line": 2, + "type": "string", + "value": "is" + }, + { + "char": 79, + "line": 2, + "type": "string", + "value": "meant" + }, + { + "char": 85, + "line": 2, + "type": "string", + "value": "to" + }, + { + "char": 88, + "line": 2, + "type": "string", + "value": "represent" + }, + { + "char": 98, + "line": 2, + "type": "string", + "value": "the" + }, + { + "char": 102, + "line": 2, + "type": "string", + "value": "third" + }, + { + "char": 108, + "line": 2, + "type": "string", + "value": "submission" + }, + { + "char": 119, + "line": 2, + "type": "string", + "value": "of" + }, + { + "char": 122, + "line": 2, + "type": "string", + "value": "three" + }, + { + "char": 128, + "line": 2, + "type": "string", + "value": "in" + }, + { + "char": 131, + "line": 2, + "type": "string", + "value": "a" + }, + { + "char": 133, + "line": 2, + "type": "string", + "value": "small" + }, + { + "char": 139, + "line": 2, + "type": "string", + "value": "test" + }, + { + "char": 144, + "line": 2, + "type": "string", + "value": "involving" + }, + { + "char": 154, + "line": 2, + "type": "string", + "value": "users" + }, + { + "char": 160, + "line": 2, + "type": "string", + "value": "with" + }, + { + "char": 165, + "line": 2, + "type": "string", + "value": "multiple" + }, + { + "char": 174, + "line": 2, + "type": "string", + "value": "submissions" + }, + { + "char": 185, + "line": 2, + "type": "punctuation", + "value": "." + } +] diff --git a/tests/data/test_lichen/multiple_versions/input/config.json b/tests/data/test_lichen/multiple_versions/input/config.json new file mode 100644 index 0000000..9687866 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/input/config.json @@ -0,0 +1,18 @@ +{ + "semester": "f21", + "course": "plagiarism", + "gradeable": "multiple_versions", + "config_id": 1, + "version": "all_versions", + "regex": [ + "" + ], + "regex_dirs": [ + "submissions" + ], + "language": "plaintext", + "threshold": 10, + "hash_size": 4, + "other_gradeables": [], + "ignore_submissions": [] +} \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/1/submission_1.txt b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/1/submission_1.txt new file mode 100644 index 0000000..8e0cb11 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/1/submission_1.txt @@ -0,0 +1 @@ +This file is meant to represent the first submission of three in a small test involving users with multiple submissions. This submission is a little longer than the other two submissions because we want to test that the system still works when the highest matching version is the second version. \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/2/submission_2.txt b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/2/submission_2.txt new file mode 100644 index 0000000..ec9cee0 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/aphacker/2/submission_2.txt @@ -0,0 +1 @@ +This is a test file for the Lichen plagiarism detection system. This file is meant to represent the second submission of three in a small test involving users with multiple submissions. \ No newline at end of file diff --git a/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/bitdiddle/1/submission_3.txt b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/bitdiddle/1/submission_3.txt new file mode 100644 index 0000000..6569e28 --- /dev/null +++ b/tests/data/test_lichen/multiple_versions/input/f21/plagiarism/submissions/multiple_versions/bitdiddle/1/submission_3.txt @@ -0,0 +1 @@ +This is a test file for the Lichen plagiarism detection system. This file is meant to represent the third submission of three in a small test involving users with multiple submissions. \ No newline at end of file diff --git a/tests/integration/test.py b/tests/integration/test.py index 810294f..b974596 100644 --- a/tests/integration/test.py +++ b/tests/integration/test.py @@ -21,26 +21,35 @@ def testLichen(self): data_path = Path(test_data_dir, "test_lichen", test_case, "input") shutil.copyfile(Path(data_path, "config.json"), Path(temp_dir, "config.json")) + # change the group for the temporary directory such that it matches the input group + subprocess.check_call(f"chgrp -R {data_path.group()} {temp_dir}", shell=True) + # run Lichen subprocess.check_call(f"bash {str(lichen_installation_dir)}/bin/process_all.sh {str(temp_dir)} {str(data_path)}", shell=True) ex_output_path = Path(test_data_dir, "test_lichen", test_case, "expected_output") # compare the output and expected output directory structure and file contents + ignored_files_count = 0 ex_files_count = 0 for root, dirs, files in os.walk(ex_output_path): ex_files_count += len(dirs) + len(files) for file in files: - if file != "lichen_job_output.txt" and file != "git_placeholder.txt": - ex_path = Path(root, file) - if root.replace(str(ex_output_path), "") == "": - act_path = Path(temp_dir, file) - else: - act_path = Path(temp_dir, root.replace(str(ex_output_path), "").strip("/"), file) + if file == "lichen_job_output.txt": + continue + if file == "git_placeholder.txt": + ignored_files_count += 1 + continue + + ex_path = Path(root, file) + if root.replace(str(ex_output_path), "") == "": + act_path = Path(temp_dir, file) + else: + act_path = Path(temp_dir, root.replace(str(ex_output_path), "").strip("/"), file) - with open(ex_path) as ex_file: - with open(act_path) as act_file: - self.assertEqual(ex_file.read().strip(), act_file.read().strip()) + with open(ex_path) as ex_file: + with open(act_path) as act_file: + self.assertEqual(ex_file.read().strip(), act_file.read().strip()) for dir in dirs: ex_path = Path(root, dir) @@ -52,10 +61,8 @@ def testLichen(self): self.assertTrue(os.path.isdir(act_path)) act_files_count = 0 - for _, dirs, files in os.walk(temp_dir): + for _, dirs, files in os.walk(Path(temp_dir)): act_files_count += len(dirs) + len(files) - # NOTE: We must subtract two here because git doesn't store empty directories - # This will have to change in the future when we add more test gradeables - # may not have empty directories - self.assertEqual(ex_files_count - 2, act_files_count) + # ensure that we didn't miss any files by checking that there are the same number of files in each directory + self.assertEqual(ex_files_count - ignored_files_count, act_files_count)