Pearson Linear Correlation Co-efficient Calculator

Example 1: The first example shows a 'schema level' script written in the Groovy language. The script will calculate the Pearson linear correlation coefficient for two variables. The data for the variables are obtained as entity.field(s) contained in a given data set defined in SQL. We think this script might be highly useful when deciding upon the distinct variables to employ in the construction and analysis of QSAR expressions/models, derived from your data model. In order to help show the script as a useful template, for other such approaches, we can break the script into it's three conceptual parts:

A Groovy class called 'QueryBuilder' which will return a 'sample' result set from a RDBMS table.
A Groovy function called 'Pearson' that implements the correlation coefficient over the sample set
A Groovy SWING window which provides the user interface to the record set and Pearson computation

QueryBuilder class This class obtains an internal injected connection and executes the stated SQL statement in order to obtain the sample results set. This class can be instantiated with a SQL WHERE clause filter. It returns the fields from a JChem table separately for binding to the SWING controls and for use in the calculations. Currently, the SQL must be edited in the class, by the user who runs the script. The entity in use is a JChem structures entity, with logP and TPSA field calculations already added.

Pearson function This function accepts two lists of data (X and Y) and returns the Pearson Correlation coefficient. It is easy to replace this function with your own linear correlation coefficient implementation.

Swing interface The SWING interface shows the results of the SQL statement and provides two navigation buttons 'Next' and 'Previous'. There are two drop down boxes that can be used to select the 'X' and 'Y' variable choices for the Pearson calculation. Finally, there is a button to compute the coefficient. This calls the function and sets the adjacent text box value with the return result 'r'. You are encouraged to try different SWING formats! Currently the drop down box is hard coded with known fields for the entity defined.

Example 2: The second example is a more evolved version. The code(function) for calculating the Pearson coefficient now includes a call to an external grape library. This should be a more efficient way and might handle much larger X and Y data sets better than completing the same calculations in the Groovy language. Coming soon in version 5.9!


import groovy.sql.*
import java.sql.*
import groovy.swing.SwingBuilder
import java.awt.*
import javax.swing.*
import java.util.*
import org.apache.derby.jdbc.EmbeddedDataSource40

// dbutler@chemaxon.com
// Jan 2012
// Small record navigator GUI to obtain Pearson linear correlation co-efficient for choice of input fields
// Input is SQL results & choice of fields is given in drop down box
// https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

// Slot your connection in here and build more complex queries
class QueryBuilder
{
    def ds = new EmbeddedDataSource40();

    def String filter = " where cd_id < 10"  // to amend the sample size for r, amend this filter and perhaps use where rownum < sample or some random function

    QueryBuilder (String aSQLFilter)                                  // edit with your database
    {if (!aSQLFilter==null) {filter=aSQLFilter}; ds.databaseName = 'C:\\Users\\daniel\\Documents\\IJCProjects\\RelationalFormTutorial\\\.config\\RelationalFormExample\\db'; cd_id(); cd_smiles(); cd_formula(); cd_molweight(); logp(); tpsa();} // initialise each collection

    // set of populate functions
     def cd_id = []
     def cd_smiles = []
     def cd_formula = []
     def cd_molweight = []
     def logp = []
     def tpsa = []                                    // edit with your table
     def cd_id () {Sql sql = new Sql(ds); sql.eachRow('select cd_id from Structures' + this.filter) {row -> this.cd_id << "$row.cd_id"}; sql.close();}
     def cd_smiles () {Sql sql = new Sql(ds); sql.eachRow('select cd_smiles from Structures' + this.filter) {  row -> this.cd_smiles << "$row.cd_smiles"}; sql.close();}
     def cd_formula ()  {Sql sql = new Sql(ds); sql.eachRow('select cd_formula from Structures' + this.filter) {  row -> this.cd_formula << "$row.cd_formula"}; sql.close();}
     def cd_molweight ()  {Sql sql = new Sql(ds); sql.eachRow('select cd_molweight from Structures' + this.filter) {  row -> this.cd_molweight << "$row.cd_molweight"}; sql.close();}
     def logp ()  {Sql sql = new Sql(ds); sql.eachRow('select logp from Structures' + this.filter) {  row -> this.logp << "$row.logp"}; sql.close();}
     def tpsa ()  {Sql sql = new Sql(ds); sql.eachRow('select tpsa from Structures' + this.filter) {  row -> this.tpsa << "$row.tpsa"}; sql.close();}

}

  QueryBuilder q = new  QueryBuilder()

   // #Benefit can slot in any function that operates over x and y data sets 
  def double pearson (idx_x,idx_y,FIELDS,q)
  {
      double r = 0
      double x_bar = 0
      double y_bar = 0
      double x_var = 0
      double y_var = 0
      double x_sd = 0
      double y_sd = 0
      double count_x = 0
      double count_y = 0

      set_x = []
      set_y = []

      // #improvement Prefer to be able to reference like FIELDS[idx_x] and then q."FIELDS[idx_x]" <= "the dynamic string"

      if (idx_x==0) {set_x = q.cd_molweight.each {set_x << it.toDouble()}}
       if (idx_x==1) {set_x = q.logp.each {set_x << it.toDouble()}}
         if (idx_x==2) {set_x = q.tpsa.each {set_x << it.toDouble()}}

      if (idx_y==0) {set_y = q.cd_molweight.each {set_y << it.toDouble()}}
       if (idx_y==1) {set_y = q.logp.each {set_y << it.toDouble()}}
         if (idx_y==2) {set_y = q.tpsa.each {set_y << it.toDouble()}}

      set_x.each {x_bar += it.toDouble(); count_x++}
      x_bar = x_bar / count_x

      set_y.each {y_bar += it.toDouble(); count_y++}
      y_bar = y_bar / count_y

      set_x.each {x_var += Math.pow(it.toDouble() - x_bar,2)}
      x_var = x_var / count_x
      x_sd = Math.sqrt(x_var.toDouble())

      set_y.each {y_var += Math.pow(it.toDouble() - y_bar,2)}
      y_var = y_var / count_y
      y_sd = Math.sqrt(y_var.toDouble())

      set_x = set_x.collect{(it.toDouble()-x_bar)/x_sd}
      set_y = set_y.collect{(it.toDouble()-y_bar)/y_sd}

      if (count_x == count_y)
      {
      for (i in 0 .. set_x.size()-1)
      {
      r+= set_x [i] * set_y [i]
      }
      r =  r / (count_x) 
      }

      return r

  }

swing = new SwingBuilder()

// #improvement - Prefer populate for all data type NUMBER
FIELDS = ["CD_MOLWEIGHT","LOGP","TPSA"]
count = 0 // zero based index gives first row in results set
hits = q.cd_id.size()
// #improvement - try different SWING layouts to optimise view 
frame = swing.frame(title: "Pearson R explorer", layout: new GridLayout(4,3), size: [300,300],defaultCloseOperation: WindowConstants.DISPOSE_ON_CLOSE) 
{

       cd_idfield = textField(text: "cd_id:" + q.cd_id[0]) // initialise with first row
       cd_smilesfield = textField(text: "cd_smiles:" + q.cd_smiles[0]) // initialise with first row
       cd_formulafield = textField(text: "cd_formula:" + q.cd_formula[0]) // initialise with first row
       cd_molweightfield = textField(text: "cd_molweight:" + q.cd_molweight[0]) // initialise with first row
       logpfield = textField(text: "logp:" + q.logp[0]) // initialise with first row
       tpsafield = textField(text: "tpsa:" + q.tpsa[0]) // initialise with first row

       button(text:'Next record',actionPerformed: {if (count<hits-1) {count++}; cd_idfield.text =q.cd_id[count]; cd_smilesfield.text = q.cd_smiles[count]; cd_formulafield.text = q.cd_formula[count]; cd_molweightfield.text=q.cd_molweight[count]; logpfield.text = q.logp[count]; tpsafield.text = q.tpsa[count];  println "Next"})
       button(text:'Previous record',actionPerformed: {if (count>0){count--}; cd_idfield.text =q.cd_id[count]; cd_smilesfield.text = q.cd_smiles[count]; cd_formulafield.text = q.cd_formula[count]; cd_molweightfield.text=q.cd_molweight[count]; logpfield.text = q.logp[count]; tpsafield.text = q.tpsa[count]; println "Previous"})     

       X = comboBox(id: 'X', items: FIELDS, selectedIndex: 0) 
       Y = comboBox(id: 'Y', items: FIELDS, selectedIndex: 0)

       pearson = label(text: "Pearson Co-efficient:")
       button(text:'Compute Pearson',actionPerformed: {pearson.text =  String.format("%15.2f", pearson(X.selectedIndex,Y.selectedIndex,FIELDS,q)); println "Pearson r :Done!"})     

}
frame.show()