CDX File Importer

This script searches a directory recursively for CDX files. It then extracts the individual molecules from each file and loads them into the structure entity at the root of the data tree. This script is an excellent example of searching entire directories for specific files.


/**
 * CDX file loader.
 * Trawls a directory and all its subdirectories, and looks for CDX files.
 * Extracts the individual molecules from the file and loads them into the
 * structure entity at the root of the data tree.
 *
 * Usage:
 * 1. create a structure entity in the project explorer
 * 2. Add a field that will contain the name of the file that was the source of each molecule
 * 3. Edit the settings in the 'adjust these variables' section (the defaults are for the
 * Pubchem demo data tree in the sample project)
 * 4. Run the script
 *
 * @author Tim Dudgeon (tdudgeon@chemaxon.com)
 */
 
import groovy.io.FileType
import chemaxon.formats.MolImporter
import com.im.commons.progress.*
import com.im.df.api.chem.MarvinStructure
 
// --------- adjust these variables --------------
// The pattern is matched against each file's name. Inside a slashy string a single
// backslash already escapes the dot; a doubled backslash would instead require a
// literal '\' character in the name, so no ordinary '*.cdx' file would ever match.
def pattern = ~/.*\.cdx/ // pattern for file to process
def root = new File('C:/Documents/chemaxon/cdxs') // dir to start at
def STRUCTURE_FIELD = 'Structure' // name of structure field
def FILE_FIELD = 'Filename' // name of file field
// ---------- end of variables -------------------
 
// Declared up front, without values, so that the closures defined below can
// refer to them; each one is assigned later in the script.
def structF     // field whose name equals STRUCTURE_FIELD
def filenameF   // field whose name equals FILE_FIELD
def edp         // entity data provider used to insert rows
def traverse    // recursive directory walker (declared first so it can call itself)
 
// ---------- routine that processes a single file and loads its molecules
// file:  the file to read
// envRW: the environment handed over by the surrounding lock, passed through to insert
def perform = { file, envRW ->
    def molecule = MolImporter.importMol(file.bytes)
    // insert one row per fragment of the imported molecule
    molecule.convertToFrags().eachWithIndex { fragment, i ->
        println "loading $file molecule ${i+1}"
        def row = [:]
        row[structF.id] = new MarvinStructure(fragment)
        row[filenameF.id] = file.path
        edp.insert(row, null, envRW)
    }
}
 
// Resolve the entity at the root of the data tree, its data provider, and the
// two fields that every inserted row fills in.
def ety = dataTree.rootVertex.entity
edp = ety.schema.dataProvider.getEntityDataProvider(ety)
structF = ety.fields.items.find { it.name == STRUCTURE_FIELD }
filenameF = ety.fields.items.find { it.name == FILE_FIELD }

// find() returns null when no field carries the requested name. Report that
// explicitly instead of failing with a NullPointerException in the println below.
def missingFields = []
if (structF == null) missingFields << STRUCTURE_FIELD
if (filenameF == null) missingFields << FILE_FIELD
if (missingFields) {
    throw new IllegalStateException("Field(s) ${missingFields} not found in the root entity; create them or adjust STRUCTURE_FIELD / FILE_FIELD")
}
println "Found fields ${structF.id} and ${filenameF.id}"
 
// Recursively walks dir: loads every file directly inside it whose name matches
// the pattern, then descends into each subdirectory.
// Throws InterruptedException once the script has been cancelled.
traverse = { dir ->
    // stop if the script is terminated
    def abortIfCancelled = {
        if (env.getFeedback().isCancelled()) {
            def msg = "Importing molecules from $root interupted!"
            println msg
            throw new InterruptedException(msg)
        }
    }

    // Check on entry as well, so that cancelling also takes effect while walking
    // directories that contain no matching files.
    abortIfCancelled()
    dir.eachFileMatch(FileType.FILES, pattern) { file ->
        abortIfCancelled()
        // each file is loaded under its own 'loading' lock
        edp.lockable.withLock('loading') { envRW ->
            perform(file, envRW)
        }
    }
    dir.eachDir(traverse)
}
 
// Start the import: walk the configured root directory and everything below it.
traverse(root)