Skip to content

Commit

Permalink
Merge pull request #824 from samuelveigarangel/remove-duplicates-arti…
Browse files Browse the repository at this point in the history
…cles

Implementa remoção e visualização na interface administrativa de artigos duplicados.
  • Loading branch information
samuelveigarangel authored Jul 24, 2024
2 parents 1aa4724 + ded3720 commit 618e0a9
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 3 deletions.
4 changes: 4 additions & 0 deletions article/scripts/remove_duplicate_articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from article import tasks

def run(username):
    """Queue the duplicate-article removal task for asynchronous execution.

    Presumably invoked by a script runner that calls ``run()`` with
    command-line arguments -- confirm against how the project runs scripts.

    Args:
        username: forwarded to the task as the ``username`` keyword argument.
    """
    task_kwargs = {"username": username}
    tasks.remove_duplicate_articles_task.apply_async(kwargs=task_kwargs)
32 changes: 31 additions & 1 deletion article/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
from datetime import datetime

from django.db.models import Q
from django.db.models import Q, Count
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _

Expand Down Expand Up @@ -241,3 +241,33 @@ def article_complete_data(
item.save()
except Article.DoesNotExist:
pass


def remove_duplicate_articles(pid_v3=None):
    """Delete duplicated ``Article`` rows, keeping one row per ``pid_v3``.

    Articles are grouped by ``pid_v3``; for every group with more than one
    row, the row that sorts first by ``(created, id)`` is kept and all the
    others are deleted.

    Any exception raised while searching or deleting is recorded through
    ``UnexpectedEvent.create`` and is not re-raised.

    Args:
        pid_v3: when truthy, only duplicates of this ``pid_v3`` are
            considered; otherwise every ``pid_v3`` is checked.
    """
    ids_to_exclude = []
    try:
        articles = Article.objects.all()
        if pid_v3:
            articles = articles.filter(pid_v3=pid_v3)

        # One row per pid_v3 value that is shared by more than one article.
        duplicates = (
            articles.values("pid_v3")
            .annotate(pid_v3_count=Count("pid_v3"))
            .filter(pid_v3_count__gt=1)
        )

        for duplicate in duplicates:
            # "id" breaks ties between rows with the same "created" value so
            # the surviving row is always the same one; ordering by "created"
            # alone leaves the choice to the database.
            surplus_ids = (
                Article.objects.filter(pid_v3=duplicate["pid_v3"])
                .order_by("created", "id")
                .values_list("id", flat=True)[1:]
            )
            ids_to_exclude.extend(surplus_ids)

        if ids_to_exclude:
            Article.objects.filter(id__in=ids_to_exclude).delete()
    except Exception as exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        UnexpectedEvent.create(
            exception=exception,
            exc_traceback=exc_traceback,
            detail={
                # Matches this function's real dotted path.
                "task": "article.tasks.remove_duplicate_articles",
            },
        )

@celery_app.task(bind=True)
def remove_duplicate_articles_task(self, user_id=None, username=None, pid_v3=None):
    """Celery entry point that delegates to ``remove_duplicate_articles``.

    Args:
        user_id: accepted but not used by this task.
        username: accepted but not used by this task.
        pid_v3: forwarded to ``remove_duplicate_articles``.
    """
    remove_duplicate_articles(pid_v3=pid_v3)
42 changes: 41 additions & 1 deletion article/tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from freezegun import freeze_time
from django.test import TestCase
from django_test_migrations.migrator import Migrator
from datetime import datetime
from django.utils.timezone import make_aware

from article.models import Article
from article.tasks import remove_duplicate_articles


class TestArticleMigration(TestCase):
def test_migration_0013_article_article_license(self):
Expand All @@ -17,4 +24,37 @@ def test_migration_0013_article_article_license(self):

article = Article.objects.first()
self.assertEqual(article.article_license, 'https://www.teste.com.br')
migrator.reset()
migrator.reset()


class RemoveDuplicateArticlesTest(TestCase):
    """Tests for ``article.tasks.remove_duplicate_articles``."""

    def create_article_at_time(self, dt, v3):
        """Create an ``Article`` with ``pid_v3=v3`` while the clock is frozen.

        Args:
            dt: date string in ``YYYY-MM-DD`` format; used both as the frozen
                time and as the ``created`` value passed to the model.
            v3: value stored in ``pid_v3``.
        """
        # NOTE(review): the clock is frozen presumably because ``created`` is
        # filled in automatically on save -- confirm against the model.
        with freeze_time(dt):
            Article.objects.create(
                pid_v3=v3,
                created=make_aware(datetime.strptime(dt, "%Y-%m-%d")),
            )

    def test_remove_duplicates_keeps_earliest_article(self):
        self.create_article_at_time("2023-01-01", "pid1")
        self.create_article_at_time("2023-01-02", "pid1")
        self.create_article_at_time("2023-01-03", "pid1")
        remove_duplicate_articles()
        self.assertEqual(Article.objects.all().count(), 1)
        self.assertEqual(
            Article.objects.all()[0].created, make_aware(datetime(2023, 1, 1))
        )

    def test_no_removal_if_only_one_article(self):
        self.create_article_at_time("2023-01-01", "pid1")
        remove_duplicate_articles()
        self.assertEqual(Article.objects.all().count(), 1)
        self.assertEqual(
            Article.objects.all()[0].created, make_aware(datetime(2023, 1, 1))
        )

    def test_remove_duplicates_for_multiple_pids(self):
        self.create_article_at_time("2022-06-03", "pid2")
        self.create_article_at_time("2022-06-04", "pid2")
        self.create_article_at_time("2022-07-08", "pid3")
        self.create_article_at_time("2022-06-14", "pid3")
        remove_duplicate_articles()
        self.assertEqual(Article.objects.filter(pid_v3="pid2").count(), 1)
        self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1)
        self.assertEqual(
            Article.objects.get(pid_v3="pid2").created,
            make_aware(datetime(2022, 6, 3)),
        )
        self.assertEqual(
            Article.objects.get(pid_v3="pid3").created,
            make_aware(datetime(2022, 6, 14)),
        )

    def test_remove_duplicates_only_for_given_pid(self):
        # Covers the ``pid_v3`` argument: duplicates of other pids must stay.
        self.create_article_at_time("2022-06-03", "pid2")
        self.create_article_at_time("2022-06-04", "pid2")
        self.create_article_at_time("2022-06-14", "pid3")
        self.create_article_at_time("2022-07-08", "pid3")
        remove_duplicate_articles(pid_v3="pid2")
        self.assertEqual(Article.objects.filter(pid_v3="pid2").count(), 1)
        self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 2)
        self.assertEqual(
            Article.objects.get(pid_v3="pid2").created,
            make_aware(datetime(2022, 6, 3)),
        )
30 changes: 29 additions & 1 deletion article/wagtail_hooks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from django.core.exceptions import PermissionDenied
from django.db.models import Count
from django.http import HttpResponseRedirect
from django.utils.translation import gettext as _
from wagtail.contrib.modeladmin.options import (
Expand All @@ -6,6 +8,8 @@
modeladmin_register,
)
from wagtail.contrib.modeladmin.views import CreateView
from wagtail.snippets.views.snippets import SnippetViewSet
from wagtail.snippets.models import register_snippet

from article.models import ( # AbstractModel,; Category,; Title,
Article,
Expand Down Expand Up @@ -34,7 +38,6 @@ class ArticleAdmin(ModelAdmin):
list_per_page = 20
list_display = (
"sps_pkg_name",
"doi",
"pid_v3",
"pid_v2",
"valid",
Expand Down Expand Up @@ -117,3 +120,28 @@ class ArticleAdminGroup(ModelAdminGroup):


modeladmin_register(ArticleAdminGroup)


class DuplicateArticlesViewSet(SnippetViewSet):
    """Snippet listing of the ``Article`` rows considered duplicates.

    For every ``pid_v3`` shared by more than one article, every row except
    the one that sorts first by ``(created, id)`` is listed.
    """

    model = Article
    icon = 'folder'
    list_display = ["pid_v3", "updated", "created"]

    def get_queryset(self, request):
        """Return the duplicated articles; only superusers may see them.

        Args:
            request: the current request; ``request.user`` is checked.

        Returns:
            A queryset of the surplus ``Article`` rows, or an empty queryset
            when no ``pid_v3`` is duplicated.

        Raises:
            PermissionDenied: if the requesting user is not a superuser.
        """
        if not request.user.is_superuser:
            raise PermissionDenied

        # One row per pid_v3 value that is shared by more than one article.
        duplicates = (
            Article.objects.all()
            .values("pid_v3")
            .annotate(pid_v3_count=Count("pid_v3"))
            .filter(pid_v3_count__gt=1)
        )

        ids_duplicates = []
        for duplicate in duplicates:
            # "id" breaks ties between rows with the same "created" value so
            # the listed rows stay the same from one request to the next.
            surplus_ids = (
                Article.objects.filter(pid_v3=duplicate['pid_v3'])
                .order_by("created", "id")
                .values_list("id", flat=True)[1:]
            )
            ids_duplicates.extend(surplus_ids)

        if not ids_duplicates:
            return Article.objects.none()
        return Article.objects.filter(id__in=ids_duplicates)

# Register the duplicate-articles viewset through Wagtail's register_snippet.
register_snippet(DuplicateArticlesViewSet)
4 changes: 4 additions & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,7 @@ django-prometheus==2.3.1
# SciELO Legendarium
-e git+https://github.com/scieloorg/legendarium#egg=legendarium # https://github.com/scieloorg/legendarium


# freezegun
# ------------------------------------------------------------------------------
freezegun==1.5.1

0 comments on commit 618e0a9

Please sign in to comment.