-
Notifications
You must be signed in to change notification settings - Fork 0
/
converter.py
110 lines (91 loc) · 3.57 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from docx import Document
import pdfplumber
import json
# Module-level tallies updated by docx_to_text() for every file it converts:
#   countWordEducation  - files mentioning "education" / "образование"
#   countWordExperience - files mentioning "experience" / "опыт работы"
#   countWordSkill      - files mentioning "skill" / "навыки"
#   countFiles          - files converted to text successfully
countWordEducation = countWordExperience = countWordSkill = countFiles = 0
def pdf_to_text(pdf_file_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages, each page followed by a newline,
        or ``None`` if the file could not be opened or parsed (the error is
        printed).
    """
    try:
        # pdfplumber does not expose the PyPDF2-style ``PdfFileReader`` API;
        # its entry point is ``pdfplumber.open``, which exposes ``.pages``.
        with pdfplumber.open(pdf_file_path) as pdf:
            # Guard against a page yielding no text (None) so that one
            # image-only page does not discard the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
    except Exception as e:
        # Deliberately broad: conversion is best-effort and callers treat
        # ``None`` as "could not read this file".
        print(f"Ошибка при чтении файла: {e}")
        return None
def docx_to_text(file_path):
    """Convert a resume file to lower-cased text and update the counters.

    A file whose name ends with ``.docx`` (case-insensitive) is read with
    python-docx; any other file is handed to pdfplumber as a PDF.

    Side effects:
        On success increments the module-level ``countFiles`` and, when the
        corresponding keyword occurs in the text, ``countWordEducation``,
        ``countWordExperience`` and ``countWordSkill``. The text is also
        passed to ``work_with_text``.

    Args:
        file_path: Path to the ``.docx`` or PDF file.

    Returns:
        The lower-cased text of the document, or ``None`` if any step fails
        (the error is printed).
    """
    global countFiles
    global countWordEducation
    global countWordExperience
    global countWordSkill
    try:
        # Test the suffix, not a substring of the whole path: a parent folder
        # or a name such as "cv.docx.pdf" must not select the DOCX reader.
        if str(file_path).lower().endswith(".docx"):
            doc = Document(file_path)
            text = "".join(
                paragraph.text.replace("\t", " ").strip() + "\n"
                for paragraph in doc.paragraphs
            )
        else:
            with pdfplumber.open(file_path) as pdf:
                # A page may yield no text (None); treat it as empty instead
                # of failing the whole file on ``None + str``.
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )

        text = text.lower()

        # Each counter records "appears at least once in this file".
        if "education" in text or "образование" in text:
            countWordEducation += 1
        if "experience" in text or "опыт работы" in text:
            countWordExperience += 1
        if "skill" in text or "навыки" in text:
            countWordSkill += 1
        countFiles += 1

        work_with_text(text)
        return text
    except Exception as e:
        # Deliberately broad: conversion is best-effort and callers treat
        # ``None`` as "could not read this file".
        print(f"Ошибка при чтении файла: {e}")
        return None
def get_resume_texts(folder_path):
    """Convert every regular file in a folder to text.

    Entries are visited in sorted name order so the result is reproducible
    (``os.listdir`` returns names in arbitrary order). Sub-directories and
    other non-file entries are skipped rather than passed to the converter.

    Args:
        folder_path: Directory containing the resume files.

    Returns:
        A list of ``{"content": text}`` dicts, one per file that produced
        non-empty text. Files that fail to convert are omitted.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. does not exist).
    """
    resume_texts = []
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue
        text = docx_to_text(file_path)
        if text:
            resume_texts.append({"content": text})
    return resume_texts
def generate_wordcloud(texts):
    """Render and display a word cloud built from the given texts.

    Args:
        texts: Iterable of plain strings and/or ``{"content": str}`` dicts
            (the shape produced by ``get_resume_texts``). Accepting the dict
            form lets the output of ``get_resume_texts`` be passed directly;
            joining the dicts themselves would raise ``TypeError``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    all_text = " ".join(
        item["content"] if isinstance(item, dict) else item for item in texts
    )

    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        stopwords=None,
        min_font_size=10,
    ).generate(all_text)

    # Display the word cloud without axes or padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
def work_with_text(text, marker="akhundov damat"):
    """Print ``text`` when it contains ``marker``.

    Args:
        text: Document text to inspect. ``docx_to_text`` passes it already
            lower-cased, so ``marker`` should be lower-case as well.
        marker: Substring to look for. Defaults to the value that was
            previously hard-coded, so existing one-argument calls behave
            exactly as before.

    Returns:
        None.
    """
    if marker in text:
        print(text)
if __name__ == "__main__":
    # Script entry point: convert every resume in the "resume" folder, print
    # the collected statistics and dump the texts to resume_texts.json.
    folder_path = "resume"
    resume_texts = get_resume_texts(folder_path)

    if not resume_texts:
        print("Не удалось сконвертировать файлы.")
    else:
        print(resume_texts)
        # Word-cloud rendering is currently switched off:
        # generate_wordcloud(resume_texts)

        # (label, counter) pairs, printed in this order.
        summary = (
            ("Кол-во файло: \n", countFiles),
            ("Кол-во слов education встреченных хотя бы раз в файле: \n", countWordEducation),
            ("Кол-во слов experience встреченных хотя бы раз в файле: \n", countWordExperience),
            ("Кол-во слов skill встреченных хотя бы раз в файле: \n", countWordSkill),
        )
        for label, value in summary:
            print(label, value)

        # ensure_ascii=False keeps Cyrillic text readable in the output file.
        with open('resume_texts.json', 'w', encoding='utf-8') as json_file:
            json.dump(resume_texts, json_file, ensure_ascii=False, indent=4)
        print("JSON файл успешно сохранен.")