Create a Diverse Subset

    /** Select a diverse subset of structures from amongst the current results.
     * The algorithm is pretty simplistic, randomly selecting a structure,
     * checking if it is similar to one already in the pool, and if not then adding
     * to the pool.
     * Similarity scores are obtained from the results of an overlap analysis that
     * must previously have been run.
     * The results are written out to the console and can be copied, and then pasted as a new list.
     *
     * Steps:
     * 1. Run an overlap analysis using the same structure entity as query and target (Tools -> Chemistry -> Overlap analysis).
     *    Specify the similarity threshold you want to use to exclude molecules.
     * 2. Edit the parameters in the 'edit these settings' section
     * 3. Run the script
     *
     * @author Tim Dudgeon (tdudgeon@chemaxon.com)
     */
    
    import com.im.commons.progress.*
    
    // ---------- edit these settings ----------------------------------------------------
    
    def setSize = 200 // number of diverse structures to generate
    def OVERLAP_FIELD = 'Overlap hits' // field name of the overlap analysis hits field
    
    // ---------- probably no need to edit anything below here ---------------------------
    
    // The entity at the root of the data tree supplies both the ID field and the overlap field.
    def parent = dataTree.rootVertex.entity // root entity
    def fldId = parent.idField // ID field
    println "found ID field ${fldId.id}"
    // overlap field, looked up by name; find returns null when no field has that name
    def fldOvrlp = parent.fields.items.find { it.name == OVERLAP_FIELD }
    if (fldOvrlp == null) {
        // Fail here with an actionable message instead of a null-property error further down.
        throw new IllegalStateException("No field named '" + OVERLAP_FIELD + "' found on the root entity. " +
                "Run the overlap analysis first, or correct OVERLAP_FIELD.")
    }
    println "found overlap hits field ${fldOvrlp.id}"
    
    // ResultSet and VertexStates
    def rs = parent.schema.dataProvider.getDefaultResultSet(dataTree, false, DFEnvironmentRO.DEV_NULL)
    def parentVS = rs.getVertexState(dataTree.rootVertex)
    // NOTE(review): the selection loop removes entries from this list as it goes;
    // confirm that parentVS.ids hands back a list that is safe to modify.
    def ids = parentVS.ids
    // Call size() explicitly: plain property access on a list can be spread over its elements.
    println "Found ${ids.size()} parent IDs to analyse"
    
    // Build the subset by repeatedly drawing a random untested ID. A candidate is rejected when
    // its overlap report mentions an ID that is already in the subset, otherwise it is added.
    // The loop stops once setSize IDs have been collected or no untested IDs remain.
    def subset = new LinkedHashSet() // keeps the IDs in the order they were accepted
    def rand = new Random()
    rs.lockable.withLock('selecting diverse subset') { envRW ->
        while (ids && subset.size() < setSize) {
            int pos = rand.nextInt(ids.size())
            int id = ids[pos]
            print "testing $id [$pos] ... "
            ids.remove(pos) // remove from the list so we don't look for it again
    
            try {
                def data = parentVS.getData([id], DFEnvironmentRO.DEV_NULL) // read data for this ID
                def sims = data[id][fldOvrlp.id]  // get the similarity report
                def found = null
                if (sims) {
                    // Each match is "<digits> (<digits and dots>)"; group 1 is parsed as an ID.
                    // presumably the report lists hits as "<id> (<score>)" - verify against real data
                    def matcher = sims =~ /(\d+) \([\d.]+\)/
                    // first reported hit that has already been selected, or null when there is none
                    found = matcher.find { subset.contains(Integer.parseInt(it[1])) }
                }
                if (found) {
                    println "excluded"
                } else {
                    println "added"
                    subset.add(id)
                }
            } catch (Exception exc) {
                // Best effort: report the failure and carry on with the remaining IDs.
                println "ERROR Failed to load ID $id ${exc.toString()}"
            }
        }
    }
    
    // One ID per line so the output can be copied and pasted as a new list.
    println "\nSubset selection complete. Found ${subset.size()} diverse entries:\n\n" +  subset.join('\n') + '\n'