CDX File Importer

    This script searches a directory recursively and looks for CDX files. It then extracts the individual molecules from each file and loads them into the structure entity at the root of the data tree. This script is an excellent example of searching entire directories for specific files.

    
    /**
     * CDX file loader.
     * Trawls a directory and all its subdirectories, and looks for CDX files.
     * Extracts the individual molecules from the file and loads them into the
     * structure entity at the root of the data tree.
     *
     * Usage:
     * 1. create a structure entity in the project explorer
     * 2. Add a field that will contain the name of the file that was the source of each molecule
     * 3. Edit the settings in the 'adjust these variables' section (the defaults are for the
     * Pubchem demo data tree in the sample project)
     * 4. Run the script
     *
     * @author Tim Dudgeon (tdudgeon@chemaxon.com)
     */
     
    import groovy.io.FileType
    import chemaxon.formats.MolImporter
    import com.im.commons.progress.*
    import com.im.df.api.chem.MarvinStructure
     
    // --------- adjust these variables --------------
    // Regular expression selecting the files to process. Inside a slashy string a
    // backslash is taken literally, so a single one is what escapes the dot;
    // a doubled backslash would demand a literal backslash in the name instead.
    def pattern = ~/.*\.cdx/ // pattern for file to process
    def root = new File('C:/Documents/chemaxon/cdxs') // dir to start at
    def STRUCTURE_FIELD = 'Structure' // name of structure field
    def FILE_FIELD = 'Filename' // name of file field
    // ---------- end of variables -------------------
     
    // Script-wide state, declared up front so the closures defined further down
    // can refer to it; the actual values are assigned later in the script.
    def structF = null   // field that receives each molecule's structure
    def filenameF = null // field that receives the path of the source file
    def edp = null       // data provider used to insert the rows
    def traverse = null  // directory walker; declared first so it can call itself
     
    // ---------- loads a single file: each fragment of its content becomes one row
    def perform = { file, envRW ->
        // import the file's bytes as one molecule, then split it into fragments
        def fragments = MolImporter.importMol(file.bytes).convertToFrags()
        fragments.eachWithIndex { fragment, i ->
            println "loading $file molecule ${i+1}"
            // row values keyed by field id: the structure and the originating file path
            def row = [:]
            row[structF.id] = new MarvinStructure(fragment)
            row[filenameF.id] = file.path
            edp.insert(row, null, envRW)
        }
    }
     
    // Resolve the entity at the root of the data tree, its data provider, and the
    // two fields that every inserted row populates.
    def ety = dataTree.rootVertex.entity
    edp = ety.schema.dataProvider.getEntityDataProvider(ety)
    structF = ety.fields.items.find { it.name == STRUCTURE_FIELD }
    filenameF = ety.fields.items.find { it.name == FILE_FIELD }
    // find yields null when no field carries the configured name; report which
    // name is wrong instead of failing later on a null reference
    def missing = []
    if (structF == null) {
        missing << STRUCTURE_FIELD
    }
    if (filenameF == null) {
        missing << FILE_FIELD
    }
    if (missing) {
        throw new IllegalStateException("Field(s) not found in the root entity: ${missing.join(', ')}")
    }
    println "Found fields ${structF.id} and ${filenameF.id}"
     
    // Walks dir recursively: loads every regular file selected by pattern, then
    // descends into each subdirectory. Throws InterruptedException when the
    // script has been cancelled.
    traverse = { dir ->
        // stop if the script is terminated
        def abortIfCancelled = {
            if (env.getFeedback().isCancelled()) {
                def msg = "Importing molecules from $root interrupted!"
                println msg
                throw new InterruptedException(msg)
            }
        }
        // checked per directory as well as per file, so that cancelling also
        // takes effect while walking directories that contain no matching files
        abortIfCancelled()
        dir.eachFileMatch(FileType.FILES, pattern) { file ->
            abortIfCancelled()
            // each file is loaded under its own lock
            edp.lockable.withLock('loading') { envRW ->
                perform(file, envRW)
            }
        }
        dir.eachDir(traverse)
    }

    // start the process off
    traverse(root)