Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3505 Update MD5-checksum-update - using pagination #126

Merged
merged 3 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@
@Repository
public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Long> {

Page<ChromosomeEntity> findChromosomeEntitiesByInsdcAccession(String insdcAccession, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(String insdcAccession, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByRefseq(String refseq, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByRefseqOrderByInsdcAccession(String refseq, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByInsdcAccessionOrRefseq(String insdcAccession, String refseq, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession(String insdcAccession, String refseq, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccession(String asmInsdcAccession, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(String asmInsdcAccession, Pageable request);

@Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '')")
@Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '') ORDER BY c.insdcAccession")
Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccessionAndMd5checksumIsNullOrEmpty(@Param("asmInsdcAccession") String asmInsdcAccession, Pageable pageable);

@Query("SELECT distinct c.assembly.insdcAccession FROM ChromosomeEntity c WHERE c.md5checksum IS NULL OR c.md5checksum = ''")
Expand All @@ -60,33 +60,33 @@ public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Lo
@Query("DELETE FROM ChromosomeEntity c WHERE c.assembly.insdcAccession=:asmInsdcAccession")
void deleteChromosomeEntitiesByAssembly_InsdcAccession(@Param("asmInsdcAccession") String asmInsdcAccession);

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(String asmRefseq, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String genbankName, long asmTaxid, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByUcscNameAndAssembly_Taxid(String ucscName, long asmTaxid,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(String ucscName, long asmTaxid,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(String enaName, long asmTaxid,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String enaName, long asmTaxid,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssembly(String genbankName, AssemblyEntity assembly,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(String genbankName, AssemblyEntity assembly,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(String ucscName, AssemblyEntity assembly,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceNameAndAssembly(String enaName, AssemblyEntity assembly,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(String enaName, AssemblyEntity assembly,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceName(String genbankName, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(String genbankName, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceName(String enaSequenceName, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(String enaSequenceName, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq(String insdcAccession, String refseq,
Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession(String insdcAccession, String refseq,
Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByUcscName(String ucscName, Pageable request);
Page<ChromosomeEntity> findChromosomeEntitiesByUcscNameOrderByInsdcAccession(String ucscName, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByMd5checksumOrderByInsdcAccession(String md5Checksum, Pageable request);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,63 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.ResultSetExtractor;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.service.ChromosomeService;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

@Component
public class MD5ChecksumUpdater {
private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class);
private final int DEFAULT_BATCH_SIZE = 10000;
private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER";
private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata";
private RestTemplate restTemplate;
private final JdbcTemplate jdbcTemplate;
private final ChromosomeService chromosomeService;
private RestTemplate restTemplate;

@Autowired
public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) {
this.restTemplate = restTemplate;
this.jdbcTemplate = jdbcTemplate;
public MD5ChecksumUpdater(ChromosomeService chromosomeService, RestTemplate restTemplate) {
this.chromosomeService = chromosomeService;
this.restTemplate = restTemplate;
}

public void updateMD5ChecksumForAssembly(String assembly) {
public void updateMD5ChecksumForAssembly(String accession) {
logger.info("Start Update MD5 Checksum for assembly : " + accession);
try {
logger.info("Trying to update MD5 Checksum for assembly: " + assembly);
String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly
+ "' AND (c.md5checksum IS NULL OR c.md5checksum = '')";
jdbcTemplate.query(sql, (ResultSetExtractor<Void>) rs -> {
long chromosomeProcessed = 0;
List<ChromosomeEntity> chromosomeEntityList = new ArrayList<>();
while (rs.next()) {
ChromosomeEntity chromosome = new ChromosomeEntity();
chromosome.setInsdcAccession(rs.getString(1));
chromosomeEntityList.add(chromosome);
int pageNumber = 0;
Page<ChromosomeEntity> chrPage;
long chromosomeProcessed = 0;
long chromosomeUpdated = 0;
do {
Pageable pageable = PageRequest.of(pageNumber, DEFAULT_BATCH_SIZE);
chrPage = chromosomeService.getChromosomesByAssemblyAccession(accession, pageable);

if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) {
updateMd5ChecksumForChromosome(assembly, chromosomeEntityList);
chromosomeProcessed += chromosomeEntityList.size();
logger.info("Chromosomes Processed till now: " + chromosomeProcessed);
chromosomeEntityList = new ArrayList<>();
}
}
if (chromosomeEntityList.size() > 0) {
updateMd5ChecksumForChromosome(assembly, chromosomeEntityList);
chromosomeProcessed += chromosomeEntityList.size();
logger.info("Chromosomes Processed till now: " + chromosomeProcessed);
List<ChromosomeEntity> chromosomeEntityList = chrPage.getContent();
List<ChromosomeEntity> chromosomeEntitiesWithoutMD5 = chromosomeEntityList.stream()
.filter(c -> c.getMd5checksum() == null || c.getMd5checksum().isEmpty())
.collect(Collectors.toList());

// Only the entries that are still missing a checksum are handed to the updater;
// passing the whole page would re-process chromosomes that already have an MD5
// and would not match the "selected for update" count logged below.
if (!chromosomeEntitiesWithoutMD5.isEmpty()) {
    updateMd5ChecksumForChromosome(accession, chromosomeEntitiesWithoutMD5);
}

logger.info("Finished updating MD5 Checksum for assembly: " + assembly);
chromosomeProcessed += chromosomeEntityList.size();
chromosomeUpdated += chromosomeEntitiesWithoutMD5.size();
logger.info("Chromosomes Processed till now: {}, selected for update till now: {}", chromosomeProcessed, chromosomeUpdated);

pageNumber++;
} while (chrPage.hasNext());

logger.info("Finished updating MD5 Checksum for assembly: " + accession);

return null;
});
} catch (Exception e) {
logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e);
logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,17 @@ public ChromosomeService(ChromosomeRepository repository, JdbcTemplate jdbcTempl


public Page<ChromosomeEntity> getChromosomesByInsdcAccession(String insdcAccession, Pageable request) {
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByInsdcAccession(insdcAccession, request);
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(insdcAccession, request);
return stripChromosomesAndScaffoldsFromAssembly(chromosomes);
}

public Page<ChromosomeEntity> getChromosomesByRefseq(String refseq, Pageable request) {
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByRefseq(refseq, request);
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(refseq, request);
return stripChromosomesAndScaffoldsFromAssembly(chromosomes);
}

public Page<ChromosomeEntity> getChromosomesByAssemblyInsdcAccession(String asmInsdcAccession, Pageable request) {
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccession(asmInsdcAccession, request);
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(asmInsdcAccession, request);
return stripAssembliesFromChromosomes(chromosomes);
}

Expand Down Expand Up @@ -84,17 +84,17 @@ public void updateENASequenceNameForAllChromosomeInAssembly(String assembly, Lis
}

public Page<ChromosomeEntity> getChromosomesByAssemblyRefseq(String asmRefseq, Pageable request) {
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssembly_Refseq(asmRefseq, request);
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(asmRefseq, request);
return stripAssembliesFromChromosomes(chromosomes);
}

public List<AssemblyEntity> getAssembliesByChromosomeInsdcAccession(String chrInsdcAccession) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByInsdcAccession(chrInsdcAccession, Pageable.unpaged());
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(chrInsdcAccession, Pageable.unpaged());
return extractAssembliesFromChromosomes(page);
}

public List<AssemblyEntity> getAssembliesByChromosomeRefseq(String chrRefseq) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByRefseq(chrRefseq, Pageable.unpaged());
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(chrRefseq, Pageable.unpaged());
return extractAssembliesFromChromosomes(page);
}

Expand All @@ -111,64 +111,64 @@ public List<AssemblyEntity> extractAssembliesFromChromosomes(Page<ChromosomeEnti
}

public Page<ChromosomeEntity> getChromosomesByName(String name, Pageable request) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceName(name, request);
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(name, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByNameAndAssemblyTaxid(String name, long asmTaxid, Pageable request) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(name, asmTaxid, request);
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(name, asmTaxid, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByNameAndAssembly(
String name, AssemblyEntity assembly, Pageable request) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly(name, assembly, request);
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(name, assembly, request);
assembly.setChromosomes(null);
return injectAssemblyIntoChromosomes(page, assembly);
}

public Page<ChromosomeEntity> getChromosomesByAssemblyAccession(String accession, Pageable request) {
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq(
Page<ChromosomeEntity> chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession(
accession, accession, request);
return stripAssembliesFromChromosomes(chromosomes);
}

public Page<ChromosomeEntity> getChromosomesByUcscName(String ucscName, Pageable request) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByUcscName(ucscName, request);
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByUcscNameOrderByInsdcAccession(ucscName, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByUcscNameAndAssemblyTaxid(
String ucscName, long asmTaxid, Pageable request) {
Page<ChromosomeEntity> page
= repository.findChromosomeEntitiesByUcscNameAndAssembly_Taxid(ucscName, asmTaxid, request);
= repository.findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(ucscName, asmTaxid, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly,
Pageable request) {
Page<ChromosomeEntity> page
= repository.findChromosomeEntitiesByUcscNameAndAssembly(ucscName, assembly, request);
= repository.findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(ucscName, assembly, request);
assembly.setChromosomes(null);
return injectAssemblyIntoChromosomes(page, assembly);
}

public Page<ChromosomeEntity> getChromosomesByEnaName(String enaName, Pageable request) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByEnaSequenceName(enaName, request);
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(enaName, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByEnaNameAndAssemblyTaxid(
String enaName, long asmTaxid, Pageable request) {
Page<ChromosomeEntity> page
= repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(enaName, asmTaxid, request);
= repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(enaName, asmTaxid, request);
return stripChromosomesAndScaffoldsFromAssembly(page);
}

public Page<ChromosomeEntity> getChromosomesByEnaNameAndAssembly(
String enaName, AssemblyEntity assembly, Pageable request) {
Page<ChromosomeEntity> page
= repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly(enaName, assembly, request);
= repository.findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(enaName, assembly, request);
assembly.setChromosomes(null);
return injectAssemblyIntoChromosomes(page, assembly);
}
Expand Down Expand Up @@ -211,7 +211,7 @@ private void stripAssemblyFromChromosome(ChromosomeEntity chromosome) {
}

public void putChromosomeChecksumsByAccession(String accession, String md5, String trunc512) {
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseq(
Page<ChromosomeEntity> page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession(
accession, accession, Pageable.unpaged());
if (page.isEmpty()) {
throw new IllegalArgumentException(
Expand Down
Loading
Loading