PDF Trawler

    /**
     * PDF file loader.
     * Trawls a directory and all its subdirectories, and looks for PDF files.
     * Extracts the individual molecules from the file and loads them into the
     * structure entity at the root of the data tree.
     *
     * Usage:
     * 1. create a structure entity in the project explorer
     * 2. Add fields named 'Filename' and 'Name'
     * 3. Edit the settings in the 'adjust these variables' section (the defaults are for the
     * Pubchem demo data tree in the sample project)
     * 4. Run the script
     *
     * @author Tim Dudgeon
     * @rewritten 2022
     */
    
    import groovy.io.FileType
    import chemaxon.formats.MolImporter
    import com.im.commons.progress.*
    import com.im.df.api.chem.MarvinStructure
    import chemaxon.marvin.io.formats.d2s.D2SRecordReader
    
    // --------- adjust these variables --------------
    // Regex that a file name must match to be processed. The (?i) flag makes the
    // match case-insensitive so 'report.PDF' is picked up as well as 'report.pdf'.
    def pattern = ~/(?i).*\.pdf/ // pattern for file to process
    def root = new File('C:/Documents/chemaxon/pdfs') // dir to start at
    def STRUCTURE_FIELD = 'Structure' // name of structure field
    def FILE_FIELD = 'Filename' // name of file field
    def NAME_FIELD = 'Name' // name of the name field
    // ---------- end of variables -------------------
    
    // Script-level handles, declared up front so the closures defined in this
    // script can refer to them; each one is assigned later in the script.
    def structF, filenameF, nameF // the structure, file name and name fields
    def edp                       // entity data provider used for locking and inserts
    def traverse                  // directory walker; declared first so it can pass itself on
    
    // ---------- this is the routine that processes one file and loads its molecules
    /**
     * Reads every molecule from a single file with a MolImporter (format "pdf")
     * and inserts one row per molecule through the entity data provider.
     *
     * Each row holds the structure, the path of the source file and the
     * molecule's name. The importer is always closed, even when reading or
     * inserting fails, so file handles are not leaked while trawling many files.
     *
     * file  - the file to import molecules from
     * envRW - the environment handed over by the lock holder; forwarded to edp.insert
     */
    def perform = { file, envRW ->
        println "processing file $file"
        MolImporter importer = new MolImporter(file, "pdf")
        try {
            def mol
            int count = 0
            // read() yields null once the importer has no more molecules
            while ((mol = importer.read()) != null) {
                count++
                println "loading $count $mol"
                def vals = [ (structF.id) : new MarvinStructure(mol), (filenameF.id) : file.path, (nameF.id) : mol.name ]
                edp.insert(vals, null, envRW)
            }
        } finally {
            // release the underlying file regardless of how the loop ended
            importer.close()
        }
    }
    
    // Resolve the entity at the root of the data tree and its data provider.
    def ety = dataTree.rootVertex.entity
    edp = ety.schema.dataProvider.getEntityDataProvider(ety)

    // Looks a field up by its name and fails fast with a descriptive error when
    // it is missing, instead of a bare NullPointerException on '.id' later on.
    def findField = { fieldName ->
        def field = ety.fields.items.find { it.name == fieldName }
        if (field == null) {
            throw new IllegalStateException(
                "Field '${fieldName}' not found in the entity at the root of the data tree".toString())
        }
        field
    }

    structF = findField(STRUCTURE_FIELD)
    filenameF = findField(FILE_FIELD)
    nameF = findField(NAME_FIELD)
    println "Found fields ${structF.id} and ${filenameF.id}"
    
    // Walks a directory: loads every file whose name matches the pattern, then
    // recurses into each subdirectory.
    //
    // dir - the directory to process
    // Throws InterruptedException when the script has been cancelled.
    traverse = { dir ->
        // Stop if the script is terminated. Polled on entering a directory as well
        // as before each file, so a tree with few matching files can still be cancelled.
        def abortIfCancelled = {
            if (env.getFeedback().isCancelled()) {
                def msg = "Importing molecules from $root interupted!"
                println msg
                throw new InterruptedException(msg)
            }
        }

        abortIfCancelled()
        println "Looking at dir $dir"
        dir.eachFileMatch(FileType.FILES, pattern) { file ->
            abortIfCancelled()
            // the lock is taken per file and supplies the environment used for inserts
            edp.lockable.withLock('loading') { envRW ->
                perform(file, envRW)
            }
        }
        dir.eachDir(traverse)
    }

    // start the process off
    traverse(root)