Data Merger or Inserter from an SDF file

    This script merges data from a SD file into an IJC entity, based on a common key value. If that particular value is not found, a new row containing information is inserted.

    
    /** Merge data from a SD file into an IJC entity.
     *
     * Usage:
     * 1. load the first SD file into IJC using the standard import(script is configured for file 'Building_blocks_GBP_fixed.sdf')
     * 2. add a text field named MpBp
     * 3. edit the variables in the 'edit these settings' section (script is configured for file '2004 Building Blocks Database.sdf')
     * 4. execute the script
     *
     * Note: the script will run much faster if there is an index on the column for the merge field (casno in this case).
     * Note: there is special handling of the values for the CASno field from the SD file because of quirks with the data
     * from this field. If you are using this for your own data then you will need to remove this custom processing.
     *
     * @author Tim Dudgeon (tdudgeon@chemaxon.com)
     */
    
    import chemaxon.formats.MolImporter
    import chemaxon.struc.Molecule
    import com.im.df.api.dml.*
    import com.im.commons.progress.*
    import com.im.df.api.support.*
    
    // ---------- edit these settings ----------------------------------------------------
    String file = 'C:/data/structures/KeyOrganics/2004 Building Blocks Database.sdf' // the file to load
    String structureFieldName = 'Structure' // the name fo the structure field
    String mergePropName = 'CASno' // the name of the merge field in the file
    String mergeFieldName = 'casno' // the name of the merge property in the database
    // next two are a list of maps. key is the name in the file and value is the corresponding name of the field in the database
    Map insertValFieldNames = ['ID':'code', 'Purity':'purity'] // fields whose values are added only when inserting new row
    Map updateAndInsertValFieldNames = ['Mp/Bp':'MpBp'] // fields whose values are updated or inserted
    // ------------probably no need to edit anything below here ---------------------------
    
    MolImporter importer = new MolImporter(file)
    importer.grabbingEnabled = true
    Molecule mol = new Molecule()
    
    def parent = dataTree.rootVertex.entity
    def edp = parent.schema.dataProvider.getEntityDataProvider(parent)
    def structureField = parent.fields.items.find { it.name == structureFieldName }
    def mergeField = parent.fields.items.find { it.name == mergeFieldName }
    
    // find the fields - we need them later
    Map insertFields = [:]
    insertValFieldNames.each { k,v ->
        def fld = parent.fields.items.find { it.name == v }
        assert fld != null
        insertFields[k] = fld
    }
    
    Map updateAndInsertFields = [:]
    updateAndInsertValFieldNames.each { k,v ->
        def fld = parent.fields.items.find { it.name == v }
        assert fld != null
        updateAndInsertFields[k] = fld
    }
    
    def lock = edp.lockable.obtainLock('updating data')
    def env = EnvUtils.createDefaultEnvironmentRW(lock, 'updating data', true);
    
    int i = 1
    int updates = 0
    int inserts = 0
    try {
        println "Reading file"
        while (importer.read(mol)) {
            String molStr = importer.grabbedMoleculeString
            String mergeVal = MPropHandler.convertToString(mol.properties(), mergePropName);
            // custom processing of CASno values
            // strangely the cas numbers are surrounded by square brackets in this file so we need to remove them
            if (mergeVal) {
                mergeVal = mergeVal.substring(1, mergeVal.length() - 1)
            }
            Map insertVals = [:]
            insertValFieldNames.each { k,v ->
                insertVals[k] =  mol.getProperty(k)
            }
    
            Map updateAndInsertVals = [:]
            updateAndInsertValFieldNames.each { k,v ->
                updateAndInsertVals[k] =  mol.getProperty(k)
            }
    
            boolean update = false
            DFTermExpression q = null
            if (mergeVal) {
    
                q = DFTermsFactory.createFieldOperatorValueExpr(
                        Operators.EQUALS, mergeField, null, mergeVal)
    
                List ids = edp.queryForIds(q, SortDirective.EMPTY, env)
                if (ids) {
                    update = true
                    updates++
                } else {
                    inserts++
                }
            }
            if (update) {
                Map vals = [:]
                // add the values
                updateAndInsertFields.each { k,v ->
                    vals[v.id] = updateAndInsertVals[k]
                }
    
                DFUpdateDescription ud = DFUpdateDescription.create(parent, q, vals)
                Map<DFUpdateDescription, DFUpdateResult> result = edp.update([ud], DFUndoConfig.OFF, env)
                DFUpdateResult res = result[ud]
                println "$i updated ${res.changedRowsCount} rows"
            } else {
                // new map and add structure
                Map vals = [(structureField.id):molStr]
                // add the values for fields that are for insert only
                insertFields.each { k,v ->
                    vals[v.id] = insertVals[k]
                }
                // and add the ones that are for both insert and update
                updateAndInsertFields.each { k,v ->
                    vals[v.id] = updateAndInsertVals[k]
                }
                edp.insert(vals, null, env)
                println "$i inserted row"
            }
    
            i++
        }
    } finally {
        importer.close()
        env?.feedback.finish()
        lock?.release()
    }
    
    println "Finished. $updates updates, $inserts inserts"