Add literature parsing

deepvk · Dec 7, 2023 · 0b5d265 · 0b5d265
1 parent 1778153
commit 0b5d265
Show file tree

Hide file tree

Showing 6 changed files with 296 additions and 149 deletions.
diff --git a/README.md b/README.md
@@ -2,12 +2,12 @@
 
 This parser uses the Scrapy library.
 
-Currently, program parses tests from the Unified State Exam (EGE or OGE)
-in Social Studies from the [soc-ege.sdamgia](https://soc-ege.sdamgia.ru/?redir=1) and [soc-oge.sdamgia](https://soc-oge.sdamgia.ru/?redir=1) websites.
+Currently, the program parses tests from the Unified State Exam (EGE or OGE)
+from the [sdamgia](https://sdamgia.ru/?redir=1) website.
 
 ## Structure
 
-Program takes exam type, test id and the desired output file
+The program takes the exam subject, exam type, test id, and the desired output file
 name as command-line arguments. The parsing result is supposed to be stored in a jsonl file.
 
 Additionally, in the *goat* folder, there is a script called **dataset_demonstration.py**.
@@ -24,16 +24,16 @@ First, you need to install the necessary libraries. To do this, run the followin
 To run the parser, navigate to the goat directory
 and run the following command in the console:
 
-`scrapy crawl ege -a exam_type='your exam type' -a test_id='your test id' -O <output file>`
+`scrapy crawl sdamgia -a subject='your exam subject' -a exam_type='your exam type' -a test_id='your test id' -O <output file>`
+
+*your exam subject* indicates which subject the exam is in. Currently acceptable subject values are 'soc' and 'lit'.
 
 *your exam type* indicates from what exam your test was taken. Currently acceptable exam type values are 'ege' and 'oge'.
 
 *your test id* is the id of a test on the sdamgia website for the chosen subject and exam type, for example [soc-ege.sdamgia](https://soc-ege.sdamgia.ru/?redir=1) or [soc-oge.sdamgia](https://soc-oge.sdamgia.ru/?redir=1).
 
 *output file* is the name of the file that the parser will create or overwrite with the parsing output, for example ege_data.jsonl.
 
-Parser was tested on the test with id - 10861731.
-
 To run the dataset_demonstration.py script, execute the following command in the root directory:
 
 `python .\goat\dataset_demonstration.py <parser output file name>`

diff --git a/goat/dataset_demonstration.py b/goat/dataset_demonstration.py
@@ -21,6 +21,8 @@
  print(sootv_tasks["solution_text"].iloc[0])
  print("----------")
  print(sootv_tasks["answer"].iloc[0])
+ print("----------")
+ print(sootv_tasks["task_points"].iloc[0])
  print("------------------------------")
 
 mult_choice_tasks = sdamgia_data.loc[sdamgia_data["task_type"] == TaskType.MULT_CHOICE]
@@ -31,6 +33,8 @@
  print(mult_choice_tasks["solution_text"].iloc[0])
  print("----------")
  print(mult_choice_tasks["answer"].iloc[0])
+ print("----------")
+ print(mult_choice_tasks["task_points"].iloc[0])
  print("------------------------------")
 
 text_answer_tasks = sdamgia_data.loc[sdamgia_data["task_type"] == TaskType.TEXT_ANSWER]
@@ -44,6 +48,8 @@
  print("No answer")
  else:
  print(text_answer_tasks["answer"].iloc[0])
+ print("----------")
+ print(text_answer_tasks["task_points"].iloc[0])
  print("------------------------------")
 
 based_on_text_tasks = sdamgia_data.loc[sdamgia_data["task_type"] == TaskType.QUESTION_ON_TEXT]
@@ -57,3 +63,5 @@
  print("No answer")
  else:
  print(based_on_text_tasks["answer"].iloc[0])
+ print("----------")
+ print(based_on_text_tasks["task_points"].iloc[0])
diff --git a/goat/ege_data.jsonl b/goat/ege_data.jsonl
diff --git a/goat/items.py b/goat/items.py
@@ -20,6 +20,8 @@ class SdamgiaTaskItem(scrapy.Item):
  url = scrapy.Field()
  fipi_links = scrapy.Field()
  sources_links = scrapy.Field()
+ task_points = scrapy.Field()
+ criteria_table = scrapy.Field()
  html_task = scrapy.Field()
 
  pass
diff --git a/goat/spider_utils.py b/goat/spider_utils.py
@@ -18,62 +18,271 @@ class TaskType(str, Enum):
  QUESTION_ON_TEXT = "QUESTION_ON_TEXT"
 
 
class SdamgiaExamSubject(str, Enum):
    """Upper-case subject codes accepted by the sdamgia helpers.

    Every member's value is identical to its name. Because the class also
    derives from ``str``, a member compares equal to the plain code string
    (``SdamgiaExamSubject.SOC == "SOC"``), so the codes can be used directly
    as keys of the subject -> base-URL tables defined in this module.
    """

    MATH = "MATH"
    MATHB = "MATHB"
    PHYS = "PHYS"
    INF = "INF"
    RUS = "RUS"
    BIO = "BIO"
    EN = "EN"
    CHEM = "CHEM"
    GEO = "GEO"
    SOC = "SOC"
    DE = "DE"
    FR = "FR"
    LIT = "LIT"
    SP = "SP"
    HIST = "HIST"
+
+
 _BASE_DOMAIN = "sdamgia.ru"
 _SUBJECT_BASE_URL_ege = {
- "math": f"https://math-ege.{_BASE_DOMAIN}",
- "mathb": f"https://mathb-ege.{_BASE_DOMAIN}",
- "phys": f"https://phys-ege.{_BASE_DOMAIN}",
- "inf": f"https://inf-ege.{_BASE_DOMAIN}",
- "rus": f"https://rus-ege.{_BASE_DOMAIN}",
- "bio": f"https://bio-ege.{_BASE_DOMAIN}",
- "en": f"https://en-ege.{_BASE_DOMAIN}",
- "chem": f"https://chem-ege.{_BASE_DOMAIN}",
- "geo": f"https://geo-ege.{_BASE_DOMAIN}",
- "soc": f"https://soc-ege.{_BASE_DOMAIN}",
- "de": f"https://de-ege.{_BASE_DOMAIN}",
- "fr": f"https://fr-ege.{_BASE_DOMAIN}",
- "lit": f"https://lit-ege.{_BASE_DOMAIN}",
- "sp": f"https://sp-ege.{_BASE_DOMAIN}",
- "hist": f"https://hist-ege.{_BASE_DOMAIN}",
+ "MATH": f"https://math-ege.{_BASE_DOMAIN}",
+ "MATHB": f"https://mathb-ege.{_BASE_DOMAIN}",
+ "PHYS": f"https://phys-ege.{_BASE_DOMAIN}",
+ "INF": f"https://inf-ege.{_BASE_DOMAIN}",
+ "RUS": f"https://rus-ege.{_BASE_DOMAIN}",
+ "BIO": f"https://bio-ege.{_BASE_DOMAIN}",
+ "EN": f"https://en-ege.{_BASE_DOMAIN}",
+ "CHEM": f"https://chem-ege.{_BASE_DOMAIN}",
+ "GEO": f"https://geo-ege.{_BASE_DOMAIN}",
+ "SOC": f"https://soc-ege.{_BASE_DOMAIN}",
+ "DE": f"https://de-ege.{_BASE_DOMAIN}",
+ "FR": f"https://fr-ege.{_BASE_DOMAIN}",
+ "LIT": f"https://lit-ege.{_BASE_DOMAIN}",
+ "SP": f"https://sp-ege.{_BASE_DOMAIN}",
+ "HIST": f"https://hist-ege.{_BASE_DOMAIN}",
 }
 _SUBJECT_BASE_URL_oge = {
- "math": f"https://math-oge.{_BASE_DOMAIN}",
- "mathb": f"https://mathb-oge.{_BASE_DOMAIN}",
- "phys": f"https://phys-oge.{_BASE_DOMAIN}",
- "inf": f"https://inf-oge.{_BASE_DOMAIN}",
- "rus": f"https://rus-oge.{_BASE_DOMAIN}",
- "bio": f"https://bio-oge.{_BASE_DOMAIN}",
- "en": f"https://en-oge.{_BASE_DOMAIN}",
- "chem": f"https://chem-oge.{_BASE_DOMAIN}",
- "geo": f"https://geo-oge.{_BASE_DOMAIN}",
- "soc": f"https://soc-oge.{_BASE_DOMAIN}",
- "de": f"https://de-oge.{_BASE_DOMAIN}",
- "fr": f"https://fr-oge.{_BASE_DOMAIN}",
- "lit": f"https://lit-oge.{_BASE_DOMAIN}",
- "sp": f"https://sp-oge.{_BASE_DOMAIN}",
- "hist": f"https://hist-oge.{_BASE_DOMAIN}",
+ "MATH": f"https://math-oge.{_BASE_DOMAIN}",
+ "MATHB": f"https://mathb-oge.{_BASE_DOMAIN}",
+ "PHYS": f"https://phys-oge.{_BASE_DOMAIN}",
+ "INF": f"https://inf-oge.{_BASE_DOMAIN}",
+ "RUS": f"https://rus-oge.{_BASE_DOMAIN}",
+ "BIO": f"https://bio-oge.{_BASE_DOMAIN}",
+ "EN": f"https://en-oge.{_BASE_DOMAIN}",
+ "CHEM": f"https://chem-oge.{_BASE_DOMAIN}",
+ "GEO": f"https://geo-oge.{_BASE_DOMAIN}",
+ "SOC": f"https://soc-oge.{_BASE_DOMAIN}",
+ "DE": f"https://de-oge.{_BASE_DOMAIN}",
+ "FR": f"https://fr-oge.{_BASE_DOMAIN}",
+ "LIT": f"https://lit-oge.{_BASE_DOMAIN}",
+ "SP": f"https://sp-oge.{_BASE_DOMAIN}",
+ "HIST": f"https://hist-oge.{_BASE_DOMAIN}",
 }
 _RESHU_CT = "reshuct.by"
 _SUBJECT_BASE_URL_ct = {
- "math": f"https://math3.{_RESHU_CT}",
- "mathb": f"https://math3b.{_RESHU_CT}",
- "phys": f"https://phys.{_RESHU_CT}",
- "inf": f"https://inf.{_RESHU_CT}",
- "rus": f"https://rus.{_RESHU_CT}",
- "bio": f"https://bio.{_RESHU_CT}",
- "en": f"https://en.{_RESHU_CT}",
- "chem": f"https://chem.{_RESHU_CT}",
- "geo": f"https://geo.{_RESHU_CT}",
- "soc": f"https://soc.{_RESHU_CT}",
- "de": f"https://de.{_RESHU_CT}",
- "fr": f"https://fr.{_RESHU_CT}",
- "lit": f"https://lit.{_RESHU_CT}",
- "sp": f"https://sp.{_RESHU_CT}",
- "wh": f"https://wh.{_RESHU_CT}",
- "bh": f"https://bh.{_RESHU_CT}",
+ "MATH": f"https://math3.{_RESHU_CT}",
+ "MATHB": f"https://math3b.{_RESHU_CT}",
+ "PHYS": f"https://phys.{_RESHU_CT}",
+ "INF": f"https://inf.{_RESHU_CT}",
+ "RUS": f"https://rus.{_RESHU_CT}",
+ "BIO": f"https://bio.{_RESHU_CT}",
+ "EN": f"https://en.{_RESHU_CT}",
+ "CHEM": f"https://chem.{_RESHU_CT}",
+ "GEO": f"https://geo.{_RESHU_CT}",
+ "SOC": f"https://soc.{_RESHU_CT}",
+ "DE": f"https://de.{_RESHU_CT}",
+ "FR": f"https://fr.{_RESHU_CT}",
+ "LIT": f"https://lit.{_RESHU_CT}",
+ "SP": f"https://sp.{_RESHU_CT}",
+ "WH": f"https://wh.{_RESHU_CT}",
+ "BH": f"https://bh.{_RESHU_CT}",
 }
 
 
def determine_soc_task_type(exam_type: ExamType, topic_id: str) -> TaskType:
    """Map a Social Studies task's topic number to its task type.

    Args:
        exam_type: Exam the task belongs to (``ExamType.EGE`` or
            ``ExamType.OGE``).
        topic_id: Topic number of the task as a string, e.g. ``"15"``.

    Returns:
        The ``TaskType`` registered for the given exam/topic pair.

    Raises:
        Exception: If the exam/topic pair is not listed in any table below.
    """
    if exam_type == ExamType.EGE:
        if topic_id in ("1", "2", "4", "5", "7", "8", "9", "10", "11", "12", "14", "16"):
            return TaskType.MULT_CHOICE
        if topic_id in ("3", "6", "13", "15"):
            return TaskType.SOOTV
        if topic_id in ("21", "22", "23", "24", "25"):
            return TaskType.TEXT_ANSWER
        if topic_id in ("17", "18", "19", "20"):
            return TaskType.QUESTION_ON_TEXT

    elif exam_type == ExamType.OGE:
        if topic_id in ("2", "3", "4", "7", "8", "9", "10", "11", "13", "14", "16", "17", "18"):
            return TaskType.MULT_CHOICE
        # Two separate ids. Written as the single string "15, 19", `in`
        # becomes a substring test, which classified OGE topics "1" and "5"
        # as SOOTV before they could reach the TEXT_ANSWER branch.
        if topic_id in ("15", "19"):
            return TaskType.SOOTV
        if topic_id in ("1", "5", "6", "12", "20"):
            return TaskType.TEXT_ANSWER
        if topic_id in ("21", "22", "23", "24"):
            return TaskType.QUESTION_ON_TEXT

    raise Exception("Wrong parsed task_type")
+
+
def determine_lit_task_type(exam_type: ExamType, topic_id: str) -> TaskType:
    """Map a Literature task's topic number to its task type.

    Args:
        exam_type: Exam the task belongs to (``ExamType.EGE`` or
            ``ExamType.OGE``).
        topic_id: Topic number of the task as a string, e.g. ``"11"``.

    Returns:
        The ``TaskType`` registered for the given exam/topic pair.

    Raises:
        Exception: If the exam/topic pair is not listed below.
    """
    if exam_type == ExamType.EGE:
        # Exact comparison: `topic_id in ("11")` is a substring test on the
        # string "11", which also matched topic "1" (and the empty string)
        # and returned TEXT_ANSWER instead of QUESTION_ON_TEXT.
        if topic_id == "11":
            return TaskType.TEXT_ANSWER
        if topic_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"):
            return TaskType.QUESTION_ON_TEXT

    elif exam_type == ExamType.OGE:
        # Exact comparison for the same reason as above.
        if topic_id == "5":
            return TaskType.TEXT_ANSWER
        if topic_id in ("1", "2", "3", "4"):
            return TaskType.QUESTION_ON_TEXT

    raise Exception("Wrong parsed task_type")
+
+
def determine_task_type(subject: str, exam_type: str, topic_id: str) -> TaskType:
    """Resolve the task type for a task of the given subject and exam.

    Delegates to the subject-specific resolver; only the ``SOC`` and ``LIT``
    subjects are handled.

    Raises:
        Exception: If ``subject`` is neither ``SOC`` nor ``LIT``, or if the
            chosen resolver rejects the exam/topic pair.
    """
    if subject == SdamgiaExamSubject.SOC:
        resolver = determine_soc_task_type
    elif subject == SdamgiaExamSubject.LIT:
        resolver = determine_lit_task_type
    else:
        raise Exception("Not supported exam subject")

    return resolver(exam_type, topic_id)
+
+
def determine_soc_task_points(exam_type: ExamType, topic_id: str) -> int:
    """Return the number of points assigned to a Social Studies task.

    Args:
        exam_type: Exam the task belongs to (``ExamType.EGE`` or
            ``ExamType.OGE``).
        topic_id: Topic number of the task as a string, e.g. ``"25"``.

    Returns:
        The point value listed for the given exam/topic pair.

    Raises:
        Exception: If the exam/topic pair is not listed in any table below.
    """
    if exam_type == ExamType.EGE:
        if topic_id in ("1", "3", "9", "12"):
            return 1
        if topic_id in (
            "2", "4", "5", "6", "7", "8", "10", "11",
            "13", "14", "15", "16", "17", "18",
        ):
            return 2
        if topic_id in ("19", "20", "21", "23"):
            return 3
        if topic_id in ("22", "24"):
            return 4
        # Exact comparison: `in ("25")` is a substring test on a string and
        # would accept an empty topic id instead of rejecting it.
        if topic_id == "25":
            return 6

    elif exam_type == ExamType.OGE:
        if topic_id in (
            "2", "3", "4", "7", "8", "9", "10", "11",
            "13", "14", "16", "17", "18", "19", "20",
        ):
            return 1
        if topic_id in ("1", "6", "15", "21", "22", "24"):
            return 2
        if topic_id in ("5", "23"):
            return 3
        # Exact comparison for the same reason as "25" above.
        if topic_id == "12":
            return 4

    raise Exception("Wrong parsed task_type")
+
+
def determine_lit_task_points(exam_type: ExamType, topic_id: str) -> int:
    """Return the number of points assigned to a Literature task.

    Args:
        exam_type: Exam the task belongs to (``ExamType.EGE`` or
            ``ExamType.OGE``).
        topic_id: Topic number of the task as a string, e.g. ``"11"``.

    Returns:
        The point value listed for the given exam/topic pair.

    Raises:
        Exception: If the exam/topic pair is not listed below.
    """
    # Single-id cases use `==`: a parenthesised string such as ("11") is not
    # a tuple, so `in` would be a substring test and would accept an empty
    # topic id instead of rejecting it.
    if exam_type == ExamType.EGE:
        if topic_id in ("1", "2", "3", "6", "7", "8"):
            return 1
        if topic_id in ("4", "9"):
            return 4
        if topic_id in ("5", "10"):
            return 8
        if topic_id == "11":
            return 18

    elif exam_type == ExamType.OGE:
        if topic_id in ("1", "3"):
            return 4
        if topic_id == "2":
            return 5
        if topic_id == "4":
            return 8
        if topic_id == "5":
            return 16

    raise Exception("Wrong parsed task_type")
+
+
def determine_task_points(subject: str, exam_type: str, topic_id: str) -> int:
    """Return the point value of a task for the given subject and exam.

    Delegates to the subject-specific points table; only the ``SOC`` and
    ``LIT`` subjects are handled.

    Args:
        subject: Subject code, compared against ``SdamgiaExamSubject`` members.
        exam_type: Exam type forwarded to the subject-specific function.
        topic_id: Topic number of the task as a string.

    Returns:
        The integer number of points for the task.

    Raises:
        Exception: If ``subject`` is neither ``SOC`` nor ``LIT``, or if the
            subject-specific function rejects the exam/topic pair.
    """
    if subject == SdamgiaExamSubject.SOC:
        return determine_soc_task_points(exam_type, topic_id)
    if subject == SdamgiaExamSubject.LIT:
        return determine_lit_task_points(exam_type, topic_id)
    raise Exception("Not supported exam subject")
+
+
 def get_exam_link(subject: str, exam_type: ExamType) -> str:
  if exam_type == ExamType.OGE:
  return _SUBJECT_BASE_URL_oge[subject]