From 7fac675bb1810bc09ad29181ce151586f35b0ca3 Mon Sep 17 00:00:00 2001 From: agusev Date: Tue, 8 Oct 2024 16:21:18 +0300 Subject: [PATCH 1/2] =?UTF-8?q?=D0=A1=D0=BF=D0=B8=D1=81=D0=BA=D0=B8=20?= =?UTF-8?q?=D0=BC=D0=BE=D0=B4=D0=B5=D1=80=D0=B0=D1=82=D0=BE=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=20=D0=B8=20=D0=B2=D0=B5=D0=B4=D1=83=D1=89=D0=B8=D1=85=20?= =?UTF-8?q?=D0=B2=D1=8B=D0=BD=D0=B5=D0=BD=D1=81=D0=B5=D0=BD=D1=8B=20=D0=B2?= =?UTF-8?q?=20=D0=BA=D0=BE=D0=BD=D1=84=D0=B8=D0=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/parser/main.go | 2 +- cmd/service/main.go | 2 +- internal/config/config.go | 6 +++ .../qaparser/questionanswer/questionanswer.go | 45 ++++++++++++------- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/cmd/parser/main.go b/cmd/parser/main.go index e213c25..a717ff9 100644 --- a/cmd/parser/main.go +++ b/cmd/parser/main.go @@ -38,7 +38,7 @@ func main() { time.Sleep(100 * time.Millisecond) log.Printf("Task %v processed\n", taskID.String()) - entry := questionanswer.NewEntry(taskID) + entry := questionanswer.NewEntry(taskID, cfg) client := httpclient.New(nil) err := entry.FetchData(client) diff --git a/cmd/service/main.go b/cmd/service/main.go index fc7617e..1d69cc5 100644 --- a/cmd/service/main.go +++ b/cmd/service/main.go @@ -39,7 +39,7 @@ func main() { time.Sleep(100 * time.Millisecond) log.Printf("Task %v processed\n", taskID.String()) - entry := questionanswer.NewEntry(taskID) + entry := questionanswer.NewEntry(taskID, cfg) client := httpclient.New(nil) err := entry.FetchData(client) diff --git a/internal/config/config.go b/internal/config/config.go index 88cc112..1c04339 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -15,6 +15,7 @@ type Config struct { RandomDelay *time.Duration `yaml:"random_delay" env-default:"150s"` ManticoreIndex []Index `yaml:"manticore"` EntryChanBuffer int `yaml:"entry_chan_buffer" env-default:"20"` + Questionanswer Questionanswer `yaml:"questionanswer"` Splitter Splitter `yaml:"splitter"` Parsers []Parser `yaml:"parsers"` } @@ -23,6 +24,11 @@ type Index struct { Name string `yaml:"index"` } +type Questionanswer struct { + Moderator []string `yaml:"moderator"` + Responsible []string `yaml:"responsible"` +} + type Parser struct { Url string `yaml:"url"` Current bool `yaml:"current" env-default:"true"` diff --git a/internal/qaparser/questionanswer/questionanswer.go b/internal/qaparser/questionanswer/questionanswer.go index 91d20f4..50a5a24 100644 --- a/internal/qaparser/questionanswer/questionanswer.go +++ b/internal/qaparser/questionanswer/questionanswer.go @@ -10,19 +10,22 @@ import ( "unicode/utf8" "github.com/PuerkitoBio/goquery" + "github.com/terratensor/svodd-server/internal/config" "github.com/terratensor/svodd-server/internal/lib/httpclient" "golang.org/x/net/html" ) type Entry struct { - Url *url.URL - Title string - Video *url.URL - Datetime *time.Time - Content []QuestionAnswer - Fragments []Fragment - Comments []Comment - Html string + Url *url.URL + Title string + Video *url.URL + Datetime *time.Time + Content []QuestionAnswer + Fragments []Fragment + Comments []Comment + Html string + moderator []string + responsible []string } type Fragment struct { @@ -48,9 +51,22 @@ type Comment struct { const TypeComment = 3 -func NewEntry(url *url.URL) *Entry { +var defaultModerator = []string{"Ведущий", "Ведущая:", "Дмитрий Таран:", "Сергей Будков:", "ВедущийЯ:", "Айнис Казимирович Петкус"} +var defaultResponsible = []string{"Валерий Викторович Пякин:", "Валерий Викторович", "Пякин Валерий Викторович", "В.В. Пякин:", "Валерий"} + +func NewEntry(url *url.URL, cfg *config.Config) *Entry { + moderator := cfg.Questionanswer.Moderator + if condition := len(moderator) == 0; condition { + moderator = defaultModerator + } + responsible := cfg.Questionanswer.Responsible + if condition := len(responsible) == 0; condition { + responsible = defaultResponsible + } return &Entry{ - Url: url, + Url: url, + moderator: moderator, + responsible: responsible, } } @@ -146,9 +162,6 @@ func parseAvatarFile(avatarFile string) *url.URL { return u } -var moderator = []string{"Ведущий", "Ведущая:", "Дмитрий Таран:", "Сергей Будков:", "ВедущийЯ:", "Айнис Казимирович Петкус"} -var responsible = []string{"Валерий Викторович Пякин:", "Валерий Викторович", "Пякин Валерий Викторович", "В.В. Пякин:", "Валерий"} - // SplitIntoChunks разбивает текст на вопросы и ответы. // Он основан на поиске конкретных строк в тексте. // Если нашелся текст "Ведущий:", то он начинает добавлять текст в массив вопросов. @@ -173,9 +186,9 @@ func (e *Entry) SplitIntoChunks(els *goquery.Selection) { text := strings.TrimSpace(s.Text()) // Мы ищем текст "Ведущий:". - moderatorIndex, curModerator := checkStrIndex(text, moderator) + moderatorIndex, curModerator := checkStrIndex(text, e.moderator) // Мы ищем текст "Валерий Викторович Пякин:". - responsibleIndex, curResponsible := checkStrIndex(text, responsible) + responsibleIndex, curResponsible := checkStrIndex(text, e.responsible) // Если нашелся текст "Ведущий:", то мы начинаем новый вопрос. if moderatorIndex == 0 { @@ -253,7 +266,7 @@ func (e *Entry) splitAnswers() { // If not, add the answer without any changes. if startAnswer { // Check if the answer starts with "Валерий Викторович Пякин:". - responsibleIndex, _ := checkStrIndex(ans, responsible) + responsibleIndex, _ := checkStrIndex(ans, e.responsible) if responsibleIndex != 0 && strings.Index(ans, "") != 0 { // Add a tag to the answer. fragment.QuestionAnswer += fmt.Sprintf( From 6843e276ac940936ba8198d64b49850aad2fa8c9 Mon Sep 17 00:00:00 2001 From: agusev Date: Tue, 8 Oct 2024 17:44:55 +0300 Subject: [PATCH 2/2] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D1=8B=20=D0=B2=D0=B8=D1=81=D1=8F=D1=89=D0=B8=D0=B5?= =?UTF-8?q?=20=D0=BC=D0=BD=D0=BE=D0=B3=D0=BE=D1=82=D0=BE=D1=87=D0=B8=D1=8F?= =?UTF-8?q?=20=D0=B2=20=D1=84=D1=80=D0=B0=D0=B3=D0=BC=D0=B5=D0=BD=D1=82?= =?UTF-8?q?=D0=B0=D1=85=20=D0=92=D0=BE=D0=BF=D1=80=D0=BE=D1=81=20=E2=80=94?= =?UTF-8?q?=20=D0=9E=D1=82=D0=B2=D1=82=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/parser/local.yaml | 15 +++++++++++++++ .../qaparser/questionanswer/questionanswer.go | 13 ++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/config/parser/local.yaml b/config/parser/local.yaml index 5d03572..57f45f5 100644 --- a/config/parser/local.yaml +++ b/config/parser/local.yaml @@ -6,6 +6,21 @@ manticore: - index: questions - index: questions_ext entry_chan_buffer: 20 +questionanswer: + moderator: + - "Ведущий" + - "Ведущая:" + - "Дмитрий Таран:" + - "Сергей Будков:" + - "ВедущийЯ:" + - "Айнис Казимирович Петкус" + responsible: + - "Валерий Викторович Пякин:" + - "Валерий Викторович" + - "Пякин Валерий Викторович" + - "В.В. Пякин:" + - "Валерий" + - "Викторович Валерий Пякин" splitter: opt_chunk_size: 1800 # оптимальный размер фрагмента контента для поиска, на эти фрагменты будет разбит контент max_chunk_size: 3600 # максимальный размер фрагмента контента для поиска diff --git a/internal/qaparser/questionanswer/questionanswer.go b/internal/qaparser/questionanswer/questionanswer.go index 50a5a24..772a0a7 100644 --- a/internal/qaparser/questionanswer/questionanswer.go +++ b/internal/qaparser/questionanswer/questionanswer.go @@ -269,11 +269,14 @@ func (e *Entry) splitAnswers() { responsibleIndex, _ := checkStrIndex(ans, e.responsible) if responsibleIndex != 0 && strings.Index(ans, "") != 0 { // Add a tag to the answer. - fragment.QuestionAnswer += fmt.Sprintf( - "

Валерий Викторович: … %v

", - ans, - ) - startAnswer = false + // Условие если ans не пустой, во избежание висящих … без текста, когда были вырезаны изображения из исходника. + if ans != "" { + fragment.QuestionAnswer += fmt.Sprintf( + "

Валерий Викторович: … %v

", + ans, + ) + startAnswer = false + } } else { // Add the answer without any changes. fragment.QuestionAnswer += fmt.Sprintf("

%v

", ans)