diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/NpOpenSourceCollectorApplication.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/NpOpenSourceCollectorApplication.java index f42d55f..452b22e 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/NpOpenSourceCollectorApplication.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/NpOpenSourceCollectorApplication.java @@ -50,11 +50,11 @@ public void run(String... args) throws Exception { //cleaning the DB before filling it - //mongoTemplate.getDb().drop(); + mongoTemplate.getDb().drop(); - System.out.println("Code version from 20 August 2019"); + System.out.println("Code version from 23rd september 2019"); if (args.length > 0) { String dataDirectory = args[0]; @@ -87,10 +87,10 @@ public void run(String... args) throws Exception { - updaterService.updateSourceNaturalProducts(); //compute similarities between natural products similarityComputationService.generateAllPairs(); - similarityComputationService.computeSimilarities(); + //similarityComputationService.computeSimilarities(); + similarityComputationService.doParallelizedWork(); diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/DatabaseTypeChecker.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/DatabaseTypeChecker.java new file mode 100644 index 0000000..e206c26 --- /dev/null +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/DatabaseTypeChecker.java @@ -0,0 +1,95 @@ +package de.unijena.cheminf.npopensourcecollector.misc; + +import org.springframework.stereotype.Service; + +import java.util.Arrays; +import java.util.HashSet; + + +@Service +public class DatabaseTypeChecker { + + + + + private final String[] africa = {"afrodb", "afrocancer", "afromalariadb", "afrotryp", "conmednp", "etm", "mitishamba", "nanpdb", "p-anapl", "sancdb"}; + private final String[] china = {"him", "hit", "tcmdb_taiwan", "tcmid", "tipdb"}; + private final String[] india = {"imppat", "inpacdb"}; + private final String[] europe = {"tppt"}; + private final String[] america = {"nubbedb", "uefs", "biofacquim"}; + + private final HashSet continentAfrica = new HashSet(Arrays.asList(africa)); + private final HashSet continentIndia = new HashSet(Arrays.asList(india)); + private final HashSet continentChina = new HashSet(Arrays.asList(china)); + private final HashSet continentEurope = new HashSet(Arrays.asList(europe)); + private final HashSet continentAmerica = new HashSet(Arrays.asList(america)); + + + + private final String[] plants = {"uefs","tppt","tmdb","tipdb","tcmid", "tcmdb_taiwan","spektraris","sancdb", + "respect","p-anapl", "npact","nanpdb","mitishamba","inpacdb","imppat", "hit","him","etm","conmednp", + "afrotryp", "afromalariadb","afrocancer","afrodb"}; + private final String[] bacteria = {"streptomedb"}; + private final String[] fungi = {"lichendatabase"}; + private final String[] animals = {}; + private final String[] marine = {}; + private final String[] mixed = {"nubbedb","npcare","npatlas","npass","analyticon_all_np", "biofacquim"}; + + private final HashSet taxPlants = new HashSet(Arrays.asList(plants)); + private final HashSet taxBacteria = new HashSet(Arrays.asList(bacteria)); + private final HashSet taxFungi = new HashSet(Arrays.asList(fungi)); + private final HashSet taxAnimals = new HashSet(Arrays.asList(animals)); + private final HashSet taxMarine = new HashSet(Arrays.asList(marine)); + private final HashSet taxMixed = new HashSet(Arrays.asList(mixed)); + + + public String checkContinent(String sourceDB){ + + if(continentAfrica.contains(sourceDB)){ + return "africa"; + } + else if(continentChina.contains(sourceDB)){ + return "china"; + } + else if(continentIndia.contains(sourceDB)){ + return "india"; + } + else if(continentEurope.contains(sourceDB)){ + return "europe"; + } + else if(continentAmerica.contains(sourceDB)){ + return "southamerica"; + } + else { + return "nogeo"; + } + } + + + public String checkKingdom(String sourceDB){ + if(taxPlants.contains(sourceDB)){ + return "plants"; + } + else if(taxBacteria.contains(sourceDB)){ + return "bacteria"; + } + else if(taxAnimals.contains(sourceDB)){ + return "animals"; + } + else if(taxFungi.contains(sourceDB)){ + return "fungi"; + } + else if(taxMarine.contains(sourceDB)){ + return "marine"; + } + else if(taxMixed.contains(sourceDB)){ + return "mixed"; + } + else{ + return "notax"; + } + + } + + +} diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/MoleculeChecker.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/MoleculeChecker.java index 9ac79ff..558f2f7 100755 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/MoleculeChecker.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/misc/MoleculeChecker.java @@ -108,7 +108,6 @@ public IAtomContainer checkMolecule(IAtomContainer molecule){ } - //Remove aromaticity String smi; SmilesGenerator sg = new SmilesGenerator(SmiFlavor.Unique); SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance()); diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabase.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabase.java new file mode 100644 index 0000000..a9fe82b --- /dev/null +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabase.java @@ -0,0 +1,66 @@ +package de.unijena.cheminf.npopensourcecollector.mongocollections; + +import org.springframework.data.annotation.Id; +import org.springframework.data.mongodb.core.mapping.Document; + +@Document +public class NPDatabase { + + @Id + public String id; + + String name; + + String localFileName; + + String url; + + String comments; + + Integer nb_unique_molecules; + + + public String getId() { + return id; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getComments() { + return comments; + } + + public void setComments(String comments) { + this.comments = comments; + } + + public Integer getNb_unique_molecules() { + return nb_unique_molecules; + } + + public void setNb_unique_molecules(Integer nb_unique_molecules) { + this.nb_unique_molecules = nb_unique_molecules; + } + + public String getLocalFileName() { + return localFileName; + } + + public void setLocalFileName(String localFileName) { + this.localFileName = localFileName; + } +} diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabaseRepository.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabaseRepository.java new file mode 100644 index 0000000..7bf9126 --- /dev/null +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPDatabaseRepository.java @@ -0,0 +1,6 @@ +package de.unijena.cheminf.npopensourcecollector.mongocollections; + +import org.springframework.data.mongodb.repository.MongoRepository; + +public interface NPDatabaseRepository extends MongoRepository { +} diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPSimilarity.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPSimilarity.java index 1c4d677..8f55943 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPSimilarity.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/NPSimilarity.java @@ -9,9 +9,9 @@ public class NPSimilarity { @Id public String id; - public UniqueNaturalProduct uniqueNaturalProduct1; + public String uniqueNaturalProductID1; - public UniqueNaturalProduct uniqueNaturalProduct2; + public String uniqueNaturalProductID2; public Double tanimoto; @@ -26,24 +26,24 @@ public void setId(String id) { this.id = id; } - public UniqueNaturalProduct getUniqueNaturalProduct1() { - return uniqueNaturalProduct1; + public Double getTanimoto() { + return tanimoto; } - public void setUniqueNaturalProduct1(UniqueNaturalProduct uniqueNaturalProduct1) { - this.uniqueNaturalProduct1 = uniqueNaturalProduct1; + public String getUniqueNaturalProductID1() { + return uniqueNaturalProductID1; } - public UniqueNaturalProduct getUniqueNaturalProduct2() { - return uniqueNaturalProduct2; + public void setUniqueNaturalProductID1(String uniqueNaturalProductID1) { + this.uniqueNaturalProductID1 = uniqueNaturalProductID1; } - public void setUniqueNaturalProduct2(UniqueNaturalProduct uniqueNaturalProduct2) { - this.uniqueNaturalProduct2 = uniqueNaturalProduct2; + public String getUniqueNaturalProductID2() { + return uniqueNaturalProductID2; } - public Double getTanimoto() { - return tanimoto; + public void setUniqueNaturalProductID2(String uniqueNaturalProductID2) { + this.uniqueNaturalProductID2 = uniqueNaturalProductID2; } public void setTanimoto(Double tanimoto) { diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/SourceNaturalProduct.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/SourceNaturalProduct.java index 2b07176..47c5566 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/SourceNaturalProduct.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/SourceNaturalProduct.java @@ -4,6 +4,8 @@ import org.springframework.data.mongodb.core.index.Indexed; import org.springframework.data.mongodb.core.mapping.Document; +import java.util.ArrayList; + @Document public class SourceNaturalProduct { @@ -11,6 +13,7 @@ public class SourceNaturalProduct { @Id public String id; + @Indexed public String source; public String originalSmiles; @@ -21,8 +24,10 @@ public class SourceNaturalProduct { public String originalInchiKey; + @Indexed public String simpleInchi; + @Indexed public String simpleInchiKey; public Integer heavyAtomNumber; @@ -37,6 +42,20 @@ public class SourceNaturalProduct { public UniqueNaturalProduct uniqueNaturalProduct; + public ArrayList citation; + + public ArrayList taxid; + + public ArrayList organismText; + + public String continent; + + public ArrayList geographicLocation; + + public String name; + + public ArrayList synonyms; + @@ -162,4 +181,60 @@ public UniqueNaturalProduct getUniqueNaturalProduct() { public void setUniqueNaturalProduct(UniqueNaturalProduct uniqueNaturalProduct) { this.uniqueNaturalProduct = uniqueNaturalProduct; } + + public ArrayList getCitation() { + return citation; + } + + public void setCitation(ArrayList citation) { + this.citation = citation; + } + + public ArrayList getTaxid() { + return taxid; + } + + public void setTaxid(ArrayList taxid) { + this.taxid = taxid; + } + + public String getContinent() { + return continent; + } + + public void setContinent(String continent) { + this.continent = continent; + } + + public ArrayList getOrganismText() { + return organismText; + } + + public void setOrganismText(ArrayList organismText) { + this.organismText = organismText; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public ArrayList getSynonyms() { + return synonyms; + } + + public void setSynonyms(ArrayList synonyms) { + this.synonyms = synonyms; + } + + public ArrayList getGeographicLocation() { + return geographicLocation; + } + + public void setGeographicLocation(ArrayList geographicLocation) { + this.geographicLocation = geographicLocation; + } } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/UniqueNaturalProduct.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/UniqueNaturalProduct.java index cb4511a..9ae3d24 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/UniqueNaturalProduct.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/mongocollections/UniqueNaturalProduct.java @@ -20,20 +20,36 @@ public class UniqueNaturalProduct { public Integer heavy_atom_number; + @Indexed public String inchi; + @Indexed public String inchikey; + @Indexed public String smiles; + @Indexed public String clean_smiles; + @Indexed public String molecular_formula; public Double molecular_weight; + + public ArrayList citationDOI; + + public ArrayList taxid; + + public ArrayList continent; + + @Indexed + public String name; + + public Double npl_noh_score; public Double npl_score; @@ -153,7 +169,6 @@ public class UniqueNaturalProduct { - //TODO add additional features //TODO consider MDEDescriptor //TODO consider WHIMDescriptor @@ -623,4 +638,37 @@ public String getClean_smiles() { public void setClean_smiles(String clean_smiles) { this.clean_smiles = clean_smiles; } + + + public ArrayList getCitationDOI() { + return citationDOI; + } + + public void setCitationDOI(ArrayList citationDOI) { + this.citationDOI = citationDOI; + } + + public ArrayList getTaxid() { + return taxid; + } + + public void setTaxid(ArrayList taxid) { + this.taxid = taxid; + } + + public ArrayList getContinent() { + return continent; + } + + public void setContinent(ArrayList continent) { + this.continent = continent; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/CSVReader.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/CSVReader.java new file mode 100644 index 0000000..6ea3e52 --- /dev/null +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/CSVReader.java @@ -0,0 +1,337 @@ +package de.unijena.cheminf.npopensourcecollector.readers; + +import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; +import de.unijena.cheminf.npopensourcecollector.misc.DatabaseTypeChecker; +import de.unijena.cheminf.npopensourcecollector.misc.MoleculeChecker; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProduct; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; +import de.unijena.cheminf.npopensourcecollector.services.AtomContainerToSourceNaturalProductService; +import net.sf.jniinchi.INCHI_OPTION; +import net.sf.jniinchi.INCHI_RET; +import org.openscience.cdk.DefaultChemObjectBuilder; +import org.openscience.cdk.exception.CDKException; +import org.openscience.cdk.exception.InvalidSmilesException; +import org.openscience.cdk.inchi.InChIGenerator; +import org.openscience.cdk.inchi.InChIGeneratorFactory; +import org.openscience.cdk.inchi.InChIToStructure; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.smiles.SmiFlavor; +import org.openscience.cdk.smiles.SmilesGenerator; +import org.openscience.cdk.smiles.SmilesParser; + +import java.io.*; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class CSVReader implements Reader { + + + File file; + ArrayList listOfMolecules; + private LineNumberReader inchiReader; + SourceNaturalProductRepository sourceNaturalProductRepository; + AtomContainerToSourceNaturalProductService ac2snp; + MoleculeChecker moleculeChecker; + DatabaseTypeChecker databaseTypeChecker; + String source; + + public CSVReader(){ + this.listOfMolecules = new ArrayList(); + sourceNaturalProductRepository = BeanUtil.getBean(SourceNaturalProductRepository.class); + ac2snp = BeanUtil.getBean(AtomContainerToSourceNaturalProductService.class); + moleculeChecker = BeanUtil.getBean(MoleculeChecker.class); + databaseTypeChecker = BeanUtil.getBean(DatabaseTypeChecker.class); + } + + + @Override + public void readFile(File file) { + + SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.Unique ); + SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance()); + this.file = file; + this.source = file.getName().toLowerCase().replace(".csv", ""); + + + try { + BufferedReader bufferedReader = new BufferedReader(new FileReader(this.file)); + //read the header + + // if the first line is the header + ArrayList header = new ArrayList(Arrays.asList( bufferedReader.readLine().split(","))); + //System.out.println(header); + + Integer indexOfID=null; + Integer indexOfName=null; + Integer indexOfReference=null; + Integer indexOfCitation=null; + Integer indexOfDOI=null; + Integer indexOfSMILES=null; + Integer indexOfInchi=null; + Integer indexOfInchikey=null; + Integer indexOfKingdom=null; + Integer indexOfGenus=null; + Integer indexOfSpecies=null; + Integer indexOfGeo=null; + Integer indexOfCode=null; + + for(String item : header){ + + if( item.toLowerCase().equals("id") || item.toLowerCase().equals("identifier") ){ + indexOfID = header.indexOf(item); + } + if(item.toLowerCase().contains("name")){ + indexOfName = header.indexOf(item); + } + if(item.toLowerCase().contains("ref")){ + indexOfReference = header.indexOf(item); + } + if(item.toLowerCase().contains("citation")){ + indexOfCitation = header.indexOf(item); + } + if(item.toLowerCase().contains("doi")){ + indexOfDOI = header.indexOf(item); + } + if(item.toLowerCase().contains("smiles")){ + indexOfSMILES = header.indexOf(item); + } + if(item.toLowerCase().contains("inchi") && !item.toLowerCase().contains("inchikey")){ + indexOfInchi = header.indexOf(item); + } + if(item.toLowerCase().contains("inchikey")){ + indexOfInchikey = header.indexOf(item); + } + if(item.toLowerCase().contains("kingdom")){ + indexOfKingdom = header.indexOf(item); + } + if(item.toLowerCase().contains("genu")){ + indexOfGenus = header.indexOf(item); + } + if(item.toLowerCase().contains("specie")){ + indexOfSpecies = header.indexOf(item); + } + if(item.toLowerCase().contains("geo") || item.toLowerCase().contains("site") || item.toLowerCase().contains("local")){ + indexOfGeo = header.indexOf(item); + } + if( item.toLowerCase().contains("code") || item.toLowerCase().contains(this.source) ){ + indexOfCode = header.indexOf(item); + } + + } + + + if(indexOfID== null && indexOfCode != null){ + indexOfID = indexOfCode; + } + + + //read the rest of the file + int count = 1; + String line; + + while ((line = bufferedReader.readLine()) != null && count <= 600000) { + + ArrayList dataline = new ArrayList(Arrays.asList( line.split(","))); + try{ + + IAtomContainer molecule = null; + + if(indexOfSMILES != null){ + molecule = sp.parseSmiles(dataline.get(indexOfSMILES)); + + molecule.setProperty("FILE_ORIGIN", file.getName().replace(".csv", "")); + molecule.setProperty("SOURCE", source); + molecule.setProperty("ORIGINAL_SMILES", dataline.get(indexOfSMILES)); + + + if(indexOfInchi != null){ + molecule.setProperty("ORIGINAL_INCHI", dataline.get(indexOfInchi)); + + } + if(indexOfInchikey != null){ + molecule.setProperty("ORIGINAL_INCHIKEY", dataline.get(indexOfInchikey)); + } + + }else if(indexOfInchi != null){ + // READING InCHI + InChIGeneratorFactory factory = InChIGeneratorFactory.getInstance(); + InChIToStructure intostruct = factory.getInChIToStructure(dataline.get(indexOfInchi), DefaultChemObjectBuilder.getInstance()); + INCHI_RET ret = intostruct.getReturnStatus(); + if (ret == INCHI_RET.WARNING) { + // Structure generated, but with warning message + System.out.println("InChI warning: " + intostruct.getMessage()); + } else if (ret != INCHI_RET.OKAY) { + // Structure generation failed + throw new CDKException("Structure generation failed failed: " + ret.toString() + " [" + intostruct.getMessage() + "]"); + } + + molecule = intostruct.getAtomContainer(); + if(indexOfInchikey != null){ + molecule.setProperty("ORIGINAL_INCHIKEY", dataline.get(indexOfInchikey)); + } + } + + if(molecule != null){ + if(indexOfID != null){ + molecule.setID(dataline.get(indexOfID)); + molecule.setProperty("ID", dataline.get(indexOfID)); + } + else if(indexOfName != null){ + molecule.setID(dataline.get(indexOfName)); + molecule.setProperty("ID", dataline.get(indexOfName)); + } + else{ + molecule.setID(Integer.toString(count)); + molecule.setProperty("ID", Integer.toString(count)); + } + + molecule = moleculeChecker.checkMolecule(molecule); + + if (molecule != null){ + try { + List options = new ArrayList(); + options.add(INCHI_OPTION.SNon); + options.add(INCHI_OPTION.ChiralFlagOFF); + options.add(INCHI_OPTION.AuxNone); + InChIGenerator gen = InChIGeneratorFactory.getInstance().getInChIGenerator(molecule, options ); + + molecule.setProperty("SIMPLE_INCHI", gen.getInchi()); + molecule.setProperty("SIMPLE_INCHIKEY", gen.getInchiKey()); + + + } catch (CDKException e) { + Integer totalBonds = molecule.getBondCount(); + Integer ib = 0; + while (ib < totalBonds) { + + IBond b = molecule.getBond(ib); + if (b.getOrder() == IBond.Order.UNSET) { + b.setOrder(IBond.Order.SINGLE); + + } + ib++; + } + List options = new ArrayList(); + options.add(INCHI_OPTION.SNon); + options.add(INCHI_OPTION.ChiralFlagOFF); + options.add(INCHI_OPTION.AuxNone); + InChIGenerator gen = InChIGeneratorFactory.getInstance().getInChIGenerator(molecule, options ); + + molecule.setProperty("SIMPLE_INCHI", gen.getInchi()); + molecule.setProperty("SIMPLE_INCHIKEY", gen.getInchiKey()); + + } + + molecule.setProperty("SIMPLE_SMILES", smilesGenerator.create(molecule)); + + + DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd"); + LocalDate localDate = LocalDate.now(); + + molecule.setProperty("ACQUISITION_DATE", dtf.format(localDate)); + + + SourceNaturalProduct sourceNaturalProduct = ac2snp.createSNPlInstance(molecule); + + String taxa = databaseTypeChecker.checkKingdom(this.source); + if(taxa.equals("mixed")){ + //do things db by db + if(source.equals("nubbedb")){ + //there is a p at the beginning of each id for plants + if(molecule.getID().startsWith("p.")){ + taxa = "plants"; + }else{ + taxa="animals"; + } + } + else if(source.equals("npatlas")){ + if(molecule.getID().startsWith("b")){ + taxa = "bacteria"; + }else{ + taxa="fungi"; + } + } + else if(source.equals("biofacquim")){ + taxa = dataline.get(indexOfKingdom); + } + else{ + taxa="notax"; + } + } + sourceNaturalProduct.setOrganismText(new ArrayList()); + sourceNaturalProduct.organismText.add(taxa); + + if(indexOfKingdom != null){ + sourceNaturalProduct.organismText.add(dataline.get(indexOfKingdom)); + } + if(indexOfGenus != null){ + sourceNaturalProduct.organismText.add(dataline.get(indexOfGenus)); + } + if(indexOfSpecies != null){ + sourceNaturalProduct.organismText.add(dataline.get(indexOfSpecies)); + } + + //GEOGRAPHY + sourceNaturalProduct.setContinent(databaseTypeChecker.checkContinent(this.source)); + if(indexOfGeo != null){ + sourceNaturalProduct.geographicLocation = new ArrayList<>(); + sourceNaturalProduct.geographicLocation.add(dataline.get(indexOfGeo)); + } + + //citation reference and doi + if(indexOfCitation != null || indexOfDOI != null || indexOfReference != null){ + sourceNaturalProduct.citation = new ArrayList<>(); + if(indexOfCitation != null){ + sourceNaturalProduct.citation.add(dataline.get(indexOfCitation)); + } + if(indexOfDOI != null){ + sourceNaturalProduct.citation.add(dataline.get(indexOfDOI)); + } + if(indexOfReference != null){ + sourceNaturalProduct.citation.add(dataline.get(indexOfReference)); + } + } + + if(!moleculeChecker.isForbiddenMolecule(molecule)){ + sourceNaturalProductRepository.save(sourceNaturalProduct); + } + } + } + else{ + System.out.println("No molecular structure detected"); + } + + + + + } catch (CDKException e ) { + e.printStackTrace(); + System.out.println(line); + } + + count++; + } + + + } catch (IOException e ) { + e.printStackTrace(); + } + + + + } + + @Override + public ArrayList returnCorrectMolecules() { + return this.listOfMolecules; + } + + @Override + public String returnSource() { + return this.source; + } +} diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/InChiReader.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/InChiReader.java index 970428f..f54c631 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/InChiReader.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/InChiReader.java @@ -1,7 +1,9 @@ package de.unijena.cheminf.npopensourcecollector.readers; import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; +import de.unijena.cheminf.npopensourcecollector.misc.DatabaseTypeChecker; import de.unijena.cheminf.npopensourcecollector.misc.MoleculeChecker; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProduct; import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; import de.unijena.cheminf.npopensourcecollector.services.AtomContainerToSourceNaturalProductService; import net.sf.jniinchi.INCHI_OPTION; @@ -29,6 +31,7 @@ public class InChiReader implements Reader { SourceNaturalProductRepository sourceNaturalProductRepository; AtomContainerToSourceNaturalProductService ac2snp; MoleculeChecker moleculeChecker; + DatabaseTypeChecker databaseTypeChecker; String source; public InChiReader(){ @@ -36,6 +39,7 @@ public InChiReader(){ sourceNaturalProductRepository = BeanUtil.getBean(SourceNaturalProductRepository.class); ac2snp = BeanUtil.getBean(AtomContainerToSourceNaturalProductService.class); moleculeChecker = BeanUtil.getBean(MoleculeChecker.class); + databaseTypeChecker = BeanUtil.getBean(DatabaseTypeChecker.class); } @@ -151,8 +155,37 @@ public void readFile(File file) { molecule.setProperty("ACQUISITION_DATE", dtf.format(localDate)); + SourceNaturalProduct sourceNaturalProduct = ac2snp.createSNPlInstance(molecule); + + sourceNaturalProduct.setContinent(databaseTypeChecker.checkContinent(this.source)); + + String taxa = databaseTypeChecker.checkKingdom(this.source); + if(taxa.equals("mixed")){ + //do things db by db + if(source.equals("nubbedb")){ + //there is a p at the beginning of each id for plants + if(molecule.getID().startsWith("p.")){ + taxa = "plants"; + }else{ + taxa="animals"; + } + } + else if(source.equals("npatlas")){ + if(molecule.getID().startsWith("b")){ + taxa = "bacteria"; + }else{ + taxa="fungi"; + } + } + else{ + taxa="notax"; + } + } + sourceNaturalProduct.setOrganismText(new ArrayList()); + sourceNaturalProduct.organismText.add(taxa); + if(!moleculeChecker.isForbiddenMolecule(molecule)){ - sourceNaturalProductRepository.save(ac2snp.createSNPlInstance(molecule)); + sourceNaturalProductRepository.save(sourceNaturalProduct); } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/MOLReader.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/MOLReader.java index 2d4e47b..d36d431 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/MOLReader.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/MOLReader.java @@ -1,7 +1,9 @@ package de.unijena.cheminf.npopensourcecollector.readers; import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; +import de.unijena.cheminf.npopensourcecollector.misc.DatabaseTypeChecker; import de.unijena.cheminf.npopensourcecollector.misc.MoleculeChecker; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProduct; import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; import de.unijena.cheminf.npopensourcecollector.services.AtomContainerToSourceNaturalProductService; import net.sf.jniinchi.INCHI_OPTION; @@ -12,8 +14,11 @@ import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.io.iterator.IteratingSDFReader; +import org.openscience.cdk.silent.SilentChemObjectBuilder; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; +import org.openscience.cdk.tools.CDKHydrogenAdder; +import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.io.File; import java.io.FileInputStream; @@ -35,6 +40,7 @@ public class MOLReader implements Reader { MoleculeChecker moleculeChecker; + DatabaseTypeChecker databaseTypeChecker; String source; @@ -43,6 +49,7 @@ public MOLReader(){ sourceNaturalProductRepository = BeanUtil.getBean(SourceNaturalProductRepository.class); ac2snp = BeanUtil.getBean(AtomContainerToSourceNaturalProductService.class); moleculeChecker = BeanUtil.getBean(MoleculeChecker.class); + databaseTypeChecker = BeanUtil.getBean(DatabaseTypeChecker.class); } @@ -72,7 +79,7 @@ public void readFile(File file) { IAtomContainer molecule = reader.next(); molecule.setProperty("MOL_NUMBER_IN_FILE", Integer.toString(count)); - molecule.setProperty("FILE_ORIGIN", file.getName().replace(".sdf", "")); + molecule.setProperty("FILE_ORIGIN", file.getName().replace(".mol", "")); molecule.setProperty("SOURCE", source); @@ -83,21 +90,32 @@ public void readFile(File file) { boolean foundOriginalSmiles = false; molecule.setProperty("ORIGINAL_INCHI", ""); molecule.setProperty("ORIGINAL_INCHIKEY", ""); + + //trick to avoid having a molecule without even implicit hydrogens + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule); + CDKHydrogenAdder adder = + CDKHydrogenAdder.getInstance(SilentChemObjectBuilder.getInstance()); + adder.addImplicitHydrogens(molecule); + + IAtomContainer tmpMolecule = molecule.clone(); for(Object p : molecule.getProperties().keySet()){ if(p.toString().toLowerCase().contains("smiles")){ - molecule.setProperty("ORIGINAL_SMILES", molecule.getProperty(p)); + tmpMolecule.setProperty("ORIGINAL_SMILES", molecule.getProperty(p)); foundOriginalSmiles = true; } if(p.toString().toLowerCase().contains("inchi") && !(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key")) ){ - molecule.setProperty("ORIGINAL_INCHI", molecule.getProperty(p)); + tmpMolecule.setProperty("ORIGINAL_INCHI", molecule.getProperty(p)); } if(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key")){ - molecule.setProperty("ORIGINAL_INCHIKEY", molecule.getProperty(p)); + tmpMolecule.setProperty("ORIGINAL_INCHIKEY", molecule.getProperty(p)); } } + molecule = tmpMolecule; + + if(!foundOriginalSmiles) { molecule.setProperty("ORIGINAL_SMILES", smilesGenerator.create(molecule)); } @@ -155,8 +173,38 @@ public void readFile(File file) { molecule.setProperty("ACQUISITION_DATE", dtf.format(localDate)); + SourceNaturalProduct sourceNaturalProduct = ac2snp.createSNPlInstance(molecule); + + sourceNaturalProduct.setContinent(databaseTypeChecker.checkContinent(this.source)); + + String taxa = databaseTypeChecker.checkKingdom(this.source); + if(taxa.equals("mixed")){ + //do things db by db + if(source.equals("nubbedb")){ + //there is a p at the beginning of each id for plants + if(molecule.getID().startsWith("p.")){ + taxa = "plants"; + }else{ + taxa="animals"; + } + } + else if(source.equals("npatlas")){ + if(molecule.getID().startsWith("b")){ + taxa = "bacteria"; + }else{ + taxa="fungi"; + } + } + else{ + taxa="notax"; + } + } + sourceNaturalProduct.setOrganismText(new ArrayList()); + sourceNaturalProduct.organismText.add(taxa); + + if(!moleculeChecker.isForbiddenMolecule(molecule)){ - sourceNaturalProductRepository.save(ac2snp.createSNPlInstance(molecule)); + sourceNaturalProductRepository.save(sourceNaturalProduct); } } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReadWorker.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReadWorker.java index ae8a464..1e68ed5 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReadWorker.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReadWorker.java @@ -2,7 +2,13 @@ import java.io.File; import java.util.ArrayList; + +import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; +import de.unijena.cheminf.npopensourcecollector.mongocollections.NPDatabase; +import de.unijena.cheminf.npopensourcecollector.mongocollections.NPDatabaseRepository; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; import org.openscience.cdk.interfaces.IAtomContainer; +import org.springframework.beans.factory.annotation.Autowired; public class ReadWorker { @@ -20,13 +26,22 @@ public class ReadWorker { private Reader reader = null ; + NPDatabaseRepository npDatabaseRepository; + + public ReadWorker(String fileName){ + npDatabaseRepository = BeanUtil.getBean(NPDatabaseRepository.class); + this.fileToRead = new File(fileName); //System.out.println("\n\n Working on: "+fileToRead.getName() + "\n\n"); System.out.println("\n\n Working on: "+fileToRead.getAbsolutePath() + "\n\n"); + NPDatabase newDB = new NPDatabase(); + newDB.setLocalFileName(fileToRead.getAbsolutePath()); + + npDatabaseRepository.save(newDB); acceptFileFormat = acceptFile(fileName); @@ -77,6 +92,9 @@ else if (filename.endsWith("inchi") || ){ this.submittedFileFormat="inchi"; return true; + }else if(filename.endsWith("csv") ){ + this.submittedFileFormat="csv"; + return true; } @@ -102,6 +120,9 @@ else if(this.submittedFileFormat.equals("smi")){ else if(this.submittedFileFormat.equals("inchi")){ reader = new InChiReader(); } + else if(this.submittedFileFormat.equals("csv")){ + reader = new CSVReader(); + } this.reader.readFile(this.fileToRead); diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReaderService.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReaderService.java index 78e7ca1..cee6843 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReaderService.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/ReaderService.java @@ -21,10 +21,6 @@ public class ReaderService { public boolean directoryContainsMolecularFiles(String directory){ boolean molecularFileFound = false; - - - - try (Stream walk = Files.walk(Paths.get(directory))) { this.molecularFiles = walk.filter(Files::isRegularFile) @@ -32,7 +28,8 @@ public boolean directoryContainsMolecularFiles(String directory){ for(String f : this.molecularFiles){ - if(f.contains("sdf") || f.contains("smi") || f.contains("mol") || f.contains("inchi")){ + System.out.println(f); + if(f.contains("sdf") || f.contains("smi") || f.contains("mol") || f.contains("inchi") || f.contains("csv")){ molecularFileFound = true; } } @@ -40,9 +37,6 @@ public boolean directoryContainsMolecularFiles(String directory){ } catch (IOException e) { e.printStackTrace(); } - - - return molecularFileFound; } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SDFReader.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SDFReader.java index 562a5f3..d89c971 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SDFReader.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SDFReader.java @@ -1,27 +1,34 @@ package de.unijena.cheminf.npopensourcecollector.readers; +import de.unijena.cheminf.npopensourcecollector.misc.DatabaseTypeChecker; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProduct; import de.unijena.cheminf.npopensourcecollector.services.AtomContainerToSourceNaturalProductService; import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; import de.unijena.cheminf.npopensourcecollector.misc.MoleculeChecker; import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; import net.sf.jniinchi.INCHI_OPTION; import org.openscience.cdk.DefaultChemObjectBuilder; +import org.openscience.cdk.aromaticity.Kekulization; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.inchi.InChIGenerator; import org.openscience.cdk.inchi.InChIGeneratorFactory; +import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.io.iterator.IteratingSDFReader; +import org.openscience.cdk.silent.SilentChemObjectBuilder; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; +import org.openscience.cdk.tools.CDKHydrogenAdder; +import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; +import org.springframework.context.annotation.Bean; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.time.LocalDate; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; +import java.util.*; public class SDFReader implements Reader{ @@ -36,16 +43,20 @@ public class SDFReader implements Reader{ MoleculeChecker moleculeChecker; + DatabaseTypeChecker databaseTypeChecker; + String source; + public SDFReader(){ this.listOfMolecules = new ArrayList(); sourceNaturalProductRepository = BeanUtil.getBean(SourceNaturalProductRepository.class); ac2snp = BeanUtil.getBean(AtomContainerToSourceNaturalProductService.class); moleculeChecker = BeanUtil.getBean(MoleculeChecker.class); + databaseTypeChecker = BeanUtil.getBean(DatabaseTypeChecker.class); } @@ -80,25 +91,48 @@ public void readFile(File file) { + // Molecule original information boolean foundOriginalSmiles = false; molecule.setProperty("ORIGINAL_INCHI", ""); molecule.setProperty("ORIGINAL_INCHIKEY", ""); + + //trick to avoid having a molecule without even implicit hydrogens + try { + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule); + CDKHydrogenAdder adder = CDKHydrogenAdder.getInstance(SilentChemObjectBuilder.getInstance()); + adder.addImplicitHydrogens(molecule); + System.out.println("Problem with adding implicit hydrogens and molecule configuration"); + System.out.println(molecule); + + Kekulization.kekulize(molecule); + + System.out.println("Kekulization problem"); + System.out.println(molecule); + + }catch(CDKException e){ + System.out.println("Problem with molecule in "+ source); + //System.out.println(molecule); + } + + IAtomContainer tmpMolecule = molecule.clone(); for(Object p : molecule.getProperties().keySet()){ if(p.toString().toLowerCase().contains("smiles")){ - molecule.setProperty("ORIGINAL_SMILES", molecule.getProperty(p)); + tmpMolecule.setProperty("ORIGINAL_SMILES", molecule.getProperty(p)); foundOriginalSmiles = true; } - if(p.toString().toLowerCase().contains("inchi") && !(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key")) ){ - molecule.setProperty("ORIGINAL_INCHI", molecule.getProperty(p)); + if(p.toString().toLowerCase().contains("inchi") && !(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key") || p.toString().toLowerCase().contains("inchi key")) ){ + tmpMolecule.setProperty("ORIGINAL_INCHI", molecule.getProperty(p)); } - if(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key")){ - molecule.setProperty("ORIGINAL_INCHIKEY", molecule.getProperty(p)); + if(p.toString().toLowerCase().contains("inchikey") || p.toString().toLowerCase().contains("inchi_key") || p.toString().toLowerCase().contains("inchi key")){ + tmpMolecule.setProperty("ORIGINAL_INCHIKEY", molecule.getProperty(p)); } } + molecule = tmpMolecule; + if(!foundOriginalSmiles) { molecule.setProperty("ORIGINAL_SMILES", smilesGenerator.create(molecule)); @@ -156,16 +190,59 @@ public void readFile(File file) { molecule.setProperty("ACQUISITION_DATE", dtf.format(localDate)); + SourceNaturalProduct sourceNaturalProduct = ac2snp.createSNPlInstance(molecule); + + sourceNaturalProduct.setContinent(databaseTypeChecker.checkContinent(this.source)); + + String taxa = databaseTypeChecker.checkKingdom(this.source); + if(taxa.equals("mixed")){ + //do things db by db + if(source.equals("nubbedb")){ + //there is a p at the beginning of each id for plants + if(molecule.getID().startsWith("p.")){ + taxa = "plants"; + }else{ + taxa="animals"; + } + } + else if(source.equals("npatlas")){ + if(molecule.getID().startsWith("b")){ + taxa = "bacteria"; + }else{ + taxa="fungi"; + } + } + else{ + taxa="notax"; + } + } + sourceNaturalProduct.setOrganismText(new ArrayList()); + sourceNaturalProduct.organismText.add(taxa); + + + Hashtable> sdfMetaData = searchMetaData(molecule); + if(sdfMetaData.containsKey("name")){ + sourceNaturalProduct.setName(sdfMetaData.get("name").get(0)); + } + + if(sdfMetaData.containsKey("synonyms")){ + sourceNaturalProduct.setSynonyms(sdfMetaData.get("synonyms")); + } + + if(sdfMetaData.containsKey("citations")){ + sourceNaturalProduct.setCitation(sdfMetaData.get("citations")); + } + if(!moleculeChecker.isForbiddenMolecule(molecule)){ - sourceNaturalProductRepository.save(ac2snp.createSNPlInstance(molecule)); + sourceNaturalProductRepository.save(sourceNaturalProduct); } } } catch (Exception ex) { - //ex.printStackTrace(); + ex.printStackTrace(); } count++; //System.out.println(count); @@ -202,4 +279,71 @@ public String returnSource() { } + public Hashtable> searchMetaData(IAtomContainer molecule){ + Hashtable> foundMetaData = new Hashtable<>(); + + + for(Object p : molecule.getProperties().keySet()) { + + + //search for name + if (p.toString().toLowerCase().contains("name") && !(p.toString().toLowerCase().contains("database") )) { + //check if list already created or not and add to it + String n = molecule.getProperty(p); + if(foundMetaData.containsKey("name")){ + foundMetaData.get("name").add(n); + }else{ + foundMetaData.put("name" , new ArrayList<>()); + foundMetaData.get("name").add(n); + } + + } + + if(p.toString().toLowerCase().contains("synonym")){ + //see if need to split - split on ; + + //check if list already created or not and add to it + String [] s = molecule.getProperty(p).toString().split(";"); + + if(foundMetaData.containsKey("synonyms")){ + for(String minis : s) { + foundMetaData.get("synonyms").add(minis); + } + }else{ + foundMetaData.put("synonyms" , new ArrayList<>()); + for(String minis : s) { + foundMetaData.get("synonyms").add(minis); + } + } + + } + + if(p.toString().toLowerCase().contains("pubmed_citation") || p.toString().toLowerCase().contains("citation") || p.toString().toLowerCase().contains("pubmed") + || p.toString().toLowerCase().contains("doi") || p.toString().toLowerCase().contains("pmc")){ + + // split on ; + //check if list already created or not and add to it + String [] cit = molecule.getProperty(p).toString().split(";"); + + if(foundMetaData.containsKey("citations")){ + for(String c : cit) { + foundMetaData.get("citations").add(c); + } + }else{ + foundMetaData.put("citations" , new ArrayList<>()); + for(String c : cit) { + foundMetaData.get("citations").add(c); + } + } + } + + } + + + + + return foundMetaData; + } + + } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SMILESReader.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SMILESReader.java index 114e16d..6be9019 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SMILESReader.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/readers/SMILESReader.java @@ -1,7 +1,9 @@ package de.unijena.cheminf.npopensourcecollector.readers; import de.unijena.cheminf.npopensourcecollector.misc.BeanUtil; +import de.unijena.cheminf.npopensourcecollector.misc.DatabaseTypeChecker; import de.unijena.cheminf.npopensourcecollector.misc.MoleculeChecker; +import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProduct; import de.unijena.cheminf.npopensourcecollector.mongocollections.SourceNaturalProductRepository; import de.unijena.cheminf.npopensourcecollector.services.AtomContainerToSourceNaturalProductService; import net.sf.jniinchi.INCHI_OPTION; @@ -20,6 +22,7 @@ import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; public class SMILESReader implements Reader { @@ -31,6 +34,7 @@ public class SMILESReader implements Reader { SourceNaturalProductRepository sourceNaturalProductRepository; AtomContainerToSourceNaturalProductService ac2snp; MoleculeChecker moleculeChecker; + DatabaseTypeChecker databaseTypeChecker; String source; @@ -39,6 +43,7 @@ public SMILESReader(){ sourceNaturalProductRepository = BeanUtil.getBean(SourceNaturalProductRepository.class); ac2snp = BeanUtil.getBean(AtomContainerToSourceNaturalProductService.class); moleculeChecker = BeanUtil.getBean(MoleculeChecker.class); + databaseTypeChecker = BeanUtil.getBean(DatabaseTypeChecker.class); } @@ -56,9 +61,7 @@ public void readFile(File file) { this.source = file.getName().toLowerCase().replace(".smi", ""); - try{ - - smilesReader = new LineNumberReader(new InputStreamReader(new FileInputStream(file))); + try(BufferedReader smilesReader = new BufferedReader(new FileReader(file))) { System.out.println("SMILES reader creation and inserting in MongoDB for "+source); @@ -77,8 +80,21 @@ public void readFile(File file) { molecule.setProperty("MOL_NUMBER_IN_FILE", Integer.toString(count)); - molecule.setProperty("ID", splitted[1]); - molecule.setID(splitted[1]); + if(splitted.length==2) { + molecule.setProperty("ID", splitted[1]); + molecule.setID(splitted[1]); + }else if(splitted.length==1){ + //no id + molecule.setProperty("ID", Integer.toString(count)); + molecule.setID(Integer.toString(count)); + }else if(splitted.length>2){ + //join everything after 1 + ArrayList splitted2 = new ArrayList(Arrays.asList(splitted)); + + String nid = String.join(" ", splitted2); + molecule.setProperty("ID", nid); + molecule.setID(nid); + } molecule.setProperty("FILE_ORIGIN", file.getName().replace(".smi", "")); @@ -134,8 +150,38 @@ public void readFile(File file) { molecule.setProperty("ACQUISITION_DATE", dtf.format(localDate)); + + SourceNaturalProduct sourceNaturalProduct = ac2snp.createSNPlInstance(molecule); + + sourceNaturalProduct.setContinent(databaseTypeChecker.checkContinent(this.source)); + + String taxa = databaseTypeChecker.checkKingdom(this.source); + if(taxa.equals("mixed")){ + //do things db by db + if(source.equals("nubbedb")){ + //there is a p at the beginning of each id for plants + if(molecule.getID().startsWith("p.")){ + taxa = "plants"; + }else{ + taxa="animals"; + } + } + else if(source.equals("npatlas")){ + if(molecule.getID().startsWith("b")){ + taxa = "bacteria"; + }else{ + taxa="fungi"; + } + } + else{ + taxa="notax"; + } + } + sourceNaturalProduct.setOrganismText(new ArrayList()); + sourceNaturalProduct.organismText.add(taxa); + if(!moleculeChecker.isForbiddenMolecule(molecule)){ - sourceNaturalProductRepository.save(ac2snp.createSNPlInstance(molecule)); + sourceNaturalProductRepository.save(sourceNaturalProduct); } @@ -144,6 +190,9 @@ public void readFile(File file) { } catch (InvalidSmilesException e) { e.printStackTrace(); + System.out.println(line); + System.out.println(splitted); + System.out.println(splitted[0]); smilesReader.skip(count - 1); } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/FragmentCalculatorService.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/FragmentCalculatorService.java index 20876ea..393d996 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/FragmentCalculatorService.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/FragmentCalculatorService.java @@ -55,6 +55,9 @@ public void doWork(){ List allNP = uniqueNaturalProductRepository.findAll(); + int count=1; + int total=allNP.size(); + for(UniqueNaturalProduct np : allNP){ @@ -134,9 +137,15 @@ public void doWork(){ uniqueNaturalProductRepository.save(np); } + count++; + if(count%10000==0){ + System.out.println("Molecules fragmented: "+count+" ("+(double)count/(double)total+"% )"); + + } } + System.out.println("Done fragmenting natural products"); } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/MolecularFeaturesComputationService.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/MolecularFeaturesComputationService.java index bba8e14..9eeed90 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/MolecularFeaturesComputationService.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/MolecularFeaturesComputationService.java @@ -262,6 +262,7 @@ public void doWork(){ uniqueNaturalProductRepository.save(np); } + System.out.println("done"); } @@ -501,7 +502,7 @@ public void doWorkForSM(){ } - + System.out.println("done"); } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/NPUnificationService.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/NPUnificationService.java index cb01781..45d6419 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/NPUnificationService.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/NPUnificationService.java @@ -68,8 +68,6 @@ public void doWork(){ unp = computeAdditionalMolecularFeatures(unp); uniqueNaturalProductRepository.save(unp); - - //the fragmentation should go to another service } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationService.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationService.java index 5726d9c..5b2b629 100644 --- a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationService.java +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationService.java @@ -1,10 +1,12 @@ package de.unijena.cheminf.npopensourcecollector.services; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; import de.unijena.cheminf.npopensourcecollector.mongocollections.NPSimilarity; import de.unijena.cheminf.npopensourcecollector.mongocollections.NPSimilarityRepository; import de.unijena.cheminf.npopensourcecollector.mongocollections.UniqueNaturalProduct; import de.unijena.cheminf.npopensourcecollector.mongocollections.UniqueNaturalProductRepository; +import org.checkerframework.common.aliasing.qual.Unique; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.fingerprint.Fingerprinter; import org.openscience.cdk.fingerprint.IBitFingerprint; @@ -15,6 +17,10 @@ import org.springframework.stereotype.Service; import java.util.*; +import java.util.concurrent.Callable; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.ThreadPoolExecutor; @Service public class SimilarityComputationService { @@ -30,6 +36,12 @@ public class SimilarityComputationService { private Set> npPairs; + + private Integer numberOfThreads = 100 ; + + List> futures = new ArrayList>(); + + public void computeSimilarities(){ Fingerprinter fingerprinter = new Fingerprinter(); @@ -52,8 +64,8 @@ public void computeSimilarities(){ if (tanimoto_coefficient>=0.5){ NPSimilarity newSimilarity = new NPSimilarity(); - newSimilarity.setUniqueNaturalProduct1(pair.get(0)); - newSimilarity.setUniqueNaturalProduct2(pair.get(1)); + newSimilarity.setUniqueNaturalProductID1(pair.get(0).getId()); + newSimilarity.setUniqueNaturalProductID2(pair.get(1).getId()); newSimilarity.setTanimoto(tanimoto_coefficient); //newSimilarity.setDistanceMoment(distance_moment); @@ -71,17 +83,97 @@ public void computeSimilarities(){ public void generateAllPairs(){ + System.out.println("Computing pairs of NPs"); List allNP = uniqueNaturalProductRepository.findAll(); Set npset = new HashSet<>(allNP); - System.out.println("Computing pairs of NPs"); + this.npPairs = Sets.combinations( npset, 2); System.out.println("done"); } + + + public void doParallelizedWork(){ + + System.out.println("Start parallel computation of Tanimoto"); + + try{ + + Hashtable> hashtableOfNPPairs = new Hashtable<>(); + for(Set spair : npPairs) { + List pair = new ArrayList(spair); + hashtableOfNPPairs.put(pair.toString(), pair); + + } + + + ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numberOfThreads); + + + + List> npPairBatch = Lists.partition(new ArrayList(hashtableOfNPPairs.keySet()), 100); + + int taskcount = 0; + + List> todo = new ArrayList>(npPairBatch.size()); + + System.out.println("Total number of tasks:" + npPairBatch.size()); + + for(List stringNPBatch : npPairBatch){ + SimilarityComputationTask task = new SimilarityComputationTask(); + + ArrayList> pairBatch= new ArrayList<>(); + + for(String s : stringNPBatch){ + pairBatch.add(hashtableOfNPPairs.get(s)); + } + + task.setNpPairsToCompute(pairBatch); + taskcount++; + + System.out.println("Task "+taskcount+" created"); + task.taskid=taskcount; + + Future f = executor.submit(task); + + futures.add(f); + + //executor.execute(task); + + System.out.println("Task "+taskcount+" executing"); + + + } + + + + + } catch (Exception e) { + e.printStackTrace(); + } + + + } + + + public boolean processFinished(){ + + boolean allFuturesDone = true; + + for(Future future : this.futures){ + + allFuturesDone &= future.isDone(); + + } + + + System.out.println("Finished parallel computation of Tanimoto"); + return allFuturesDone; + } } diff --git a/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationTask.java b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationTask.java new file mode 100644 index 0000000..822512d --- /dev/null +++ b/src/main/java/de/unijena/cheminf/npopensourcecollector/services/SimilarityComputationTask.java @@ -0,0 +1,85 @@ +package de.unijena.cheminf.npopensourcecollector.services; + + +import de.unijena.cheminf.npopensourcecollector.mongocollections.NPSimilarity; +import de.unijena.cheminf.npopensourcecollector.mongocollections.NPSimilarityRepository; +import de.unijena.cheminf.npopensourcecollector.mongocollections.UniqueNaturalProduct; +import org.openscience.cdk.exception.CDKException; +import org.openscience.cdk.fingerprint.Fingerprinter; +import org.openscience.cdk.fingerprint.IBitFingerprint; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.similarity.Tanimoto; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.data.annotation.Transient; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + + +@Service +@Transactional(propagation = Propagation.REQUIRED, readOnly = false) +public class SimilarityComputationTask implements Runnable { + + + @Autowired + @Transient + NPSimilarityRepository npSimilarityRepository; + + @Autowired + @Transient + AtomContainerToUniqueNaturalProductService atomContainerToUniqueNaturalProductService; + + + ArrayList> npPairsToCompute; + + Integer taskid; + + @Override + public void run() { + Fingerprinter fingerprinter = new Fingerprinter(); + System.out.println("Computing similarities for task "+taskid); + + for(List spair : npPairsToCompute){ + List pair = new ArrayList(spair); + + IAtomContainer mol1 = atomContainerToUniqueNaturalProductService.createAtomContainer(pair.get(0)); + IAtomContainer mol2 = atomContainerToUniqueNaturalProductService.createAtomContainer(pair.get(1)); + + IBitFingerprint fingerprint1 = null; + IBitFingerprint fingerprint2 = null; + try { + fingerprint1 = fingerprinter.getBitFingerprint(mol1); + fingerprint2 = fingerprinter.getBitFingerprint(mol2); + double tanimoto_coefficient = Tanimoto.calculate(fingerprint1, fingerprint2); + + if (tanimoto_coefficient>=0.5){ + + NPSimilarity newSimilarity = new NPSimilarity(); + newSimilarity.setUniqueNaturalProductID1(pair.get(0).getId()); + newSimilarity.setUniqueNaturalProductID2(pair.get(1).getId()); + newSimilarity.setTanimoto(tanimoto_coefficient); + //newSimilarity.setDistanceMoment(distance_moment); + + npSimilarityRepository.save(newSimilarity); + } + + } catch (CDKException e) { + e.printStackTrace(); + } + + } + System.out.println("done"); + + } + + + public void setNpPairsToCompute(ArrayList> npPairsToCompute){ + this.npPairsToCompute = npPairsToCompute; + } + + +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index db34d33..83f3b98 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -3,7 +3,7 @@ #Mongo Config spring.data.mongodb.host=localhost spring.data.mongodb.port=27017 -spring.data.mongodb.database=COCONUTtest +spring.data.mongodb.database=COCONUTseptember24 #for MySQL