/*
Region of Interest
We term regions of a protein sequence with biased residue composition "regions of interest" (RoI).
The percentage of the sequence covered by such biased regions is designated RoI%; it is expressed as a fraction
of the total sequence length, i.e. including any signal sequence where available.
The algorithm for the identification of RoI is described here. Although it is currently implemented
in Java as part of the KNIME platform, it is a simple algorithm easily ported to other environments.
The algorithm processes each sequence independently: no state is preserved across
sequences - determination of proline-rich regions is based solely on the primary sequence.
The core algorithm accumulates proline-rich windows in a bit vector (a set bit denotes a proline-rich position,
a clear bit does not) whose length is the number of AA in the protein sequence. This bit vector is named "bv".
Within the KNIME platform, two options are available:
- To report all RoI
- Report only those results belonging to proteins with at least 10% RoI
The function getHTMLCell() reports primary sequence colour-coded with RoI shown in blue, other regions in black.
The implementation provides all the additional cells to the input table for the KNIME node.
*/
package au.edu.unimelb.plantcell.core.biojava.tasks;
import java.util.ArrayList;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataType;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.def.DoubleCell;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.data.vector.bitvector.DenseBitVector;
import au.edu.unimelb.plantcell.core.cells.SequenceValue;
/**
* Emits metrics to aid in identification of HRGP proteins. These metrics are aimed at computing
* the %PAST, %P, %PSYK and %PVK within windows and identifying "regions" rich in these metrics.
* The task provides highlighted regions and numeric metrics for subsequent use.
*
* The key concept is that windows grow (from 8 AA) for as long as the %proline and the biased composition justify it.
* For a window to be accepted as a starting window it must contain at least 20% proline, i.e. at least 2 residues out of 8,
* and it must also satisfy one of the %PAST, %PSYK or %PVK thresholds.
*
* Acknowledgment: Carolyn Schultz from University of Adelaide, South Australia
* @author andrew.cassin
*
*/
public class HRGPScreenTask extends BioJavaProcessorTask {
// Per-window residue counters behind the %PAST, %P, %PVK and %PSYK percentages:
// zeroed and recomputed by compute_window(), read by accept_window().
private int m_past, m_p, m_pvk, m_psyk;
// Number of distinct rich regions recorded by getWindows(); reported in output column 3 by getCells().
private int m_n_regions;
// Set in init(): true when the selected task name ends with "(all)", in which case getCells()
// does not require the 10% RoI coverage threshold before reporting values.
private boolean want_all;
/**
 * Initialises the task and remembers which of the two task variants was chosen.
 *
 * @param task_name the name of the selected task; a name ending in "(all)" selects
 *                  the variant that reports metrics for every input sequence
 * @param input_column_index passed through unchanged to the superclass
 * @throws Exception if the superclass initialisation fails
 */
@Override
public void init(final String task_name, int input_column_index) throws Exception {
    super.init(task_name, input_column_index);
    // the "(all)" suffix matches the second task name returned by getNames()
    final boolean report_everything = task_name.endsWith("(all)");
    want_all = report_everything;
}
/**
 * @return the category label under which this task is listed
 */
@Override
public String getCategory() {
    final String category = "Protein Metrics";
    return category;
}
/**
 * @return the names of the two task variants: the screening variant first, then the
 *         variant (suffix "(all)") that init() recognises as reporting every sequence
 */
@Override
public String[] getNames() {
    final String[] task_names = {
        "Screen for Proline-rich HRGP-like proteins",
        "Add Proline rich metrics (all)"
    };
    return task_names;
}
/**
 * Describes the six columns this task appends to the input table. The order here is the
 * index order used when getCells() and getWindows() fill the output cell array.
 *
 * NOTE(review): column 1 is declared as a list of DoubleCell while getWindows() fills it
 * with IntCell instances - confirm this is intended.
 *
 * @return the specifications of the appended columns, in output order
 */
@Override
public DataColumnSpec[] getColumnSpecs() {
    return new DataColumnSpec[] {
        // 0: colour-coded sequence
        new DataColumnSpecCreator("Region of interest (HTML)", StringCell.TYPE).createSpec(),
        // 1: length of each region
        new DataColumnSpecCreator("Window lengths satisfying threshold",
                ListCell.getCollectionType(DoubleCell.TYPE)).createSpec(),
        // 2: coverage of the sequence by regions
        new DataColumnSpecCreator("RoI Coverage (%) of predicted protein", DoubleCell.TYPE).createSpec(),
        // 3: region count
        new DataColumnSpecCreator("Number of distinct regions satisfying threshold", IntCell.TYPE).createSpec(),
        // 4: window size used
        new DataColumnSpecCreator("Window size (AA)", IntCell.TYPE).createSpec(),
        // 5: sequence of each region
        new DataColumnSpecCreator("Windows satisfying threshold (list)",
                ListCell.getCollectionType(StringCell.TYPE)).createSpec()
    };
}
/**
 * Returns the user-facing description of this task. The same text is returned for both
 * task variants.
 *
 * @param task the task name the description is requested for (not used)
 * @return the description text
 */
@Override
public String getHTMLDescription(String task) {
    // Each fragment ends with a space so that sentences and words do not run together
    // once the pieces are concatenated.
    return "Adds metrics to aid in identification of proteins related to HRGP's (incl. Extensins, AGPs etc.). "
        + "Requires protein sequence. The 'screen' task only reports values if %RoI is at least 10% (otherwise missing), the 'add metrics' "
        + "task reports values for all input sequences. The screen task is more space efficient for really large numbers of sequences.";
}
/**
 * Computes the output cells for the given row. Changes to this method must stay compatible
 * with the KNIME platform, i.e. the returned array must match the column specification
 * given by getColumnSpecs(). Performance is important for working with 1kp.
 *
 * The cells are obtained from missing_cells(...) and are returned as-is when the sequence
 * is not a protein or is shorter than the initial window of 8 AA. Otherwise, when the final
 * condition holds, the cells are filled with: [0] the HTML rendering, [1] and [5] the region
 * lengths and region sequences (via getWindows()), [2] the RoI coverage percentage,
 * [3] the number of regions and [4] the window size.
 *
 * NOTE(review): the line starting with "for (int i=0;" below looks truncated - the window
 * scanning loop and the computation of percent_roi are not visible here. Confirm against
 * the original source before changing this method.
 *
 * @param r the input row holding the sequence to analyse
 * @return one cell per column of getColumnSpecs()
 */
@Override
public DataCell[] getCells(DataRow r) {
SequenceValue sv = getSequenceForRow(r);
DataCell[] cells = missing_cells(getColumnSpecs().length);
// only protein sequences are analysed
if (!sv.getSequenceType().isProtein())
return cells;
// compute windows, starting from the minimum window of 8 AA
int len = sv.getLength();
int window_size = 8;
if (len < window_size) // not even a single full window available?
return cells;
// one bit per residue of the sequence
DenseBitVector bv = new DenseBitVector(len);
boolean got_window_start = false;
int start_pos = -1;
int n_windows = 0;
String seq = sv.getStringValue();
for (int i=0; i 0 && (want_all || percent_roi >= 10.0)) {
cells[0] = getHTMLCell(seq, bv);
getWindows(seq, bv, cells); // side effects: m_n_regions and cells[1] and cells[5]
cells[2] = new DoubleCell(percent_roi);
cells[3] = new IntCell(m_n_regions);
cells[4] = new IntCell(window_size);
}
return cells;
}
/**
 * Processes the set of "rich regions" denoted by runs of set bits in <code>bv</code> and extracts
 * the length and polypeptide sequence of each region into the output cells.
 *
 * Side effects: <code>m_n_regions</code> is always set to the number of regions found (zero when
 * there are none, so a count from a previous row is never left behind). When at least one region
 * exists, <code>cells[1]</code> receives the list of region lengths and <code>cells[5]</code> the
 * list of region sequences; otherwise <code>cells</code> is left untouched.
 *
 * @param seq   the protein sequence that <code>bv</code> describes
 * @param bv    one bits in this vector denote the proline rich regions. Must have at least
 *              <code>seq.length()</code> bits in the vector
 * @param cells output cells (which must have a size of at least <code>getColumnSpecs().length</code>);
 *              elements 1 and 5 are side-effected by this call
 */
private void getWindows(final String seq, final DenseBitVector bv, DataCell[] cells) {
    ArrayList<IntCell> col = new ArrayList<IntCell>();       // region lengths
    ArrayList<StringCell> ret = new ArrayList<StringCell>(); // region sequences
    long start = 0;
    while ((start = bv.nextSetBit(start)) >= 0) {
        long end = bv.nextClearBit(start + 1);
        if (end < 0) {
            // no clear bit follows: the region runs to the end of the vector
            col.add(new IntCell((int) (bv.length() - start)));
            ret.add(new StringCell(seq.substring((int) start)));
            break; // last region in input so that's it...
        }
        col.add(new IntCell((int) (end - start)));
        ret.add(new StringCell(seq.substring((int) start, (int) end)));
        // resume the search at the clear bit that terminated this region
        start = end;
    }
    // always publish the count so a stale value from an earlier row cannot leak through
    m_n_regions = col.size();
    // no windows? cells already contains missing values so just return
    if (col.isEmpty()) {
        return;
    }
    cells[1] = CollectionCellFactory.createListCell(col);
    cells[5] = CollectionCellFactory.createListCell(ret);
}
/**
 * Builds the string cell holding the sequence with the regions marked in <code>bv</code>
 * highlighted. A missing cell is returned when either argument is null or when the
 * bit vector length differs from the sequence length.
 *
 * NOTE(review): the loop header and the markup string literals below look truncated
 * (the literals are empty and the "for" line is incomplete) - presumably the highlighting
 * markup was lost; confirm against the original source before changing this method.
 *
 * @param seq the protein sequence to render
 * @param bv  bit vector with one bit per residue of <code>seq</code>
 * @return the rendered sequence as a string cell, or a missing cell for unusable input
 */
private DataCell getHTMLCell(String seq, DenseBitVector bv) {
// refuse to render when the inputs are absent or do not describe the same number of residues
if (seq == null || bv == null || seq.length() != bv.length())
return DataType.getMissingCell();
StringBuilder sb = new StringBuilder(seq.length());
sb.append("");
for (int i=0; i");
sb.append(c);
sb.append("");
} else {
sb.append(c);
}
}
return new StringCell(sb.toString());
}
/**
 * Decides whether the current window qualifies, using the residue counters
 * m_p, m_past, m_psyk and m_pvk (as set by compute_window()).
 *
 * The window must contain at least 20% proline; it is then accepted when at least one of
 * the following holds, each percentage being taken relative to <code>window_size</code>:
 * 1) %PAST is >= 70
 * 2) %PSYK is >= 70
 * 3) %PVK is >= 70
 * Otherwise false is returned, which will cause a search for later windows meeting the criteria.
 *
 * @param window_size the size of the current window in number of AA - used for % calculations
 * @return true if the window satisfies the proline minimum and one of the three composition thresholds
 */
private boolean accept_window(int window_size) {
    final double proline_pct = ((double) m_p) / window_size * 100.0d;
    // written as a negated >= so that a NaN percentage is rejected as well
    if (!(proline_pct >= 20.0d)) {
        return false;
    }
    final double past_pct = ((double) m_past) / window_size * 100.0d;
    final double psyk_pct = ((double) m_psyk) / window_size * 100.0d;
    final double pvk_pct = ((double) m_pvk) / window_size * 100.0d;
    return past_pct >= 70.0d || psyk_pct >= 70.0d || pvk_pct >= 70.0d;
}
/**
* Simple implementation as it keeps recalculating the same residues... but for now...
* WARNING: this method alters m_{past,p,pvk,psyk} so be careful when you call this.
*
* @param window
*/
private void compute_window(String window) {
assert(window != null && window.length() > 0);
m_past = 0;
m_p = 0;
m_pvk = 0;
m_psyk = 0;
for (int j=0; j