// PDF Trawler

/**
 * PDF file loader.
 * Trawls a directory and all its subdirectories, and looks for PDF files.
 * Extracts the individual molecules from the file and loads them into the
 * structure entity at the root of the data tree.
 *
 * Usage:
 * 1. create a structure entity in the project explorer
 * 2. Add fields named 'Filename' and 'Name'
 * 3. Edit the settings in the 'adjust these variables' section (the defaults are for the
 * Pubchem demo data tree in the sample project)
 * 4. Run the script
 *
 * @author Tim Dudgeon
 * @revrited 2022
 */

import groovy.io.FileType
import chemaxon.formats.MolImporter
import com.im.commons.progress.*
import com.im.df.api.chem.MarvinStructure
import chemaxon.marvin.io.formats.d2s.D2SRecordReader

// --------- adjust these variables --------------
// User-editable settings. The three *_FIELD values are compared against the
// names of the fields on the data tree's root entity further down the script.
// NOTE(review): the regex carries no case-insensitive flag, so file names
// ending in '.PDF' presumably will not match - confirm whether that is intended.
def pattern =  ~/.*\.pdf/ // pattern for file to process
// Directory where the recursive search starts.
def root = new File('C:/Documents/chemaxon/pdfs') // dir to start at
def STRUCTURE_FIELD = 'Structure' // name of structure field
def FILE_FIELD = 'Filename' // name of file field
def NAME_FIELD = 'Name' // name of the name field
// ---------- end of variables -------------------

// Forward declarations. These are assigned later in the script but must exist
// here so that the closures defined below can capture them.
def structF, filenameF, nameF   // field handles looked up on the root entity
def edp                         // entity data provider used for inserts and locking
def traverse                    // directory walker; declared first so it can call itself

// ---------- this is the routine that process the file and loads it
/*
 * Reads every molecule that MolImporter yields for the given file and inserts
 * one row per molecule through the entity data provider.
 *
 * file  - the file to import (opened with the "pdf" import option)
 * envRW - the environment handed over by the lock; passed straight to edp.insert
 *
 * The importer is always closed, even when reading or inserting fails, so that
 * the underlying file handle is not leaked while many files are processed.
 */
def perform = { file, envRW ->
    println "processing file $file"
    MolImporter importer = new MolImporter(file, "pdf")
    try {
        def mol = null
        int count = 0
        // read() returns null once the importer has no more molecules
        while ((mol = importer.read()) != null) {
            count++
            println "loading $count $mol"
            // one row: structure, path of the originating file, molecule name
            def vals = [ (structF.id) : new MarvinStructure(mol), (filenameF.id) : file.path, (nameF.id) : mol.name ]
            edp.insert(vals, null, envRW)
        }
    } finally {
        importer.close()
    }
}

// Resolve the root entity of the data tree, its data provider and the three
// fields the import writes to.
def ety = dataTree.rootVertex.entity
edp = ety.schema.dataProvider.getEntityDataProvider(ety)
structF = ety.fields.items.find { it.name == STRUCTURE_FIELD }
filenameF = ety.fields.items.find { it.name == FILE_FIELD }
nameF = ety.fields.items.find { it.name == NAME_FIELD }

// find() yields null when no field carries the configured name. Fail here with
// a message naming the missing field(s) instead of a bare NullPointerException
// later on (possibly in the middle of an import).
def missingFields = [(STRUCTURE_FIELD): structF, (FILE_FIELD): filenameF, (NAME_FIELD): nameF]
        .findAll { it.value == null }
        .keySet()
if (missingFields) {
    throw new IllegalStateException(
            "Root entity is missing required field(s): ${missingFields.join(', ')}".toString())
}
println "Found fields ${structF.id} and ${filenameF.id}"

/*
 * Walks a directory tree: every regular file in `dir` matching `pattern` is
 * imported under the data provider's 'loading' lock, then each subdirectory
 * is visited the same way. Throws InterruptedException when the script's
 * feedback object reports cancellation before a file is processed.
 */
traverse = { dir ->
    println "Looking at dir $dir"
    dir.eachFileMatch(FileType.FILES, pattern) { candidate ->
        // cancellation is checked once per matching file, before taking the lock
        def cancelled = env.getFeedback().isCancelled()
        if (!cancelled) {
            edp.lockable.withLock('loading') { envRW -> perform(candidate, envRW) }
        } else {
            def msg = "Importing molecules from $root interupted!"
            println msg
            throw new InterruptedException(msg)
        }
    }
    // recurse into the subdirectories after this directory's files are done
    dir.eachDir { subDir -> traverse(subDir) }
}

// kick off the walk at the configured root directory
traverse(root)