Home > RAVEN > scoreModel.m

scoreModel

PURPOSE ^

scoreRxns

SYNOPSIS ^

function [rxnScores geneScores hpaScores arrayScores]=scoreModel(model,hpaData,arrayData,tissue,celltype,noGeneScore,multipleGeneScoring,multipleCellScoring,hpaLevelScores)

DESCRIPTION ^

 scoreRxns
   Scores the reactions and genes in a model based on expression data
   from HPA and/or gene arrays

   model               a model structure
   hpaData             HPA data structure from parseHPA (opt if arrayData is 
                       supplied, default [])
   arrayData           gene expression data structure (opt if hpaData is
                       supplied, default [])
       genes           cell array with the unique gene names
       tissues         cell array with the tissue names. The list may not be
                       unique, as there can be multiple cell types per tissue
       celltypes       cell array with the cell type names for each tissue
       levels          GENESxTISSUES array with the expression level for
                       each gene in each tissue/celltype. NaN should be
                       used when no measurement was performed
   tissue              tissue to score for. Should exist in either
                       hpaData.tissues or arrayData.tissues
   celltype            cell type to score for. Should exist in either
                       hpaData.celltypes or arrayData.celltypes for this 
                       tissue (opt, default is to use the best values
                       among all the cell types for the tissue. Use [] if
                       you want to supply more arguments)
   noGeneScore         score for reactions without genes (opt, default -2)
   multipleGeneScoring determines how scores are calculated for reactions
                       with several genes ('best' or 'average')
                       (opt, default 'best')
   multipleCellScoring determines how scores are calculated when several
                       cell types are used ('best' or 'average')
                       (opt, default 'best')
   hpaLevelScores      structure with numerical scores for the expression
                       level categories from HPA. The structure should have a
                       "names" and a "scores" field (opt, see code for
                       default scores)

   rxnScores       scores for each of the reactions in model
   geneScores      scores for each of the genes in model. Genes which are
                   not in the dataset(s) have -Inf as scores
   hpaScores       scores for each of the genes in model if only taking hpaData
                   into account. Genes which are not in the dataset(s) 
                   have -Inf as scores
   arrayScores     scores for each of the genes in model if only taking arrayData
                   into account. Genes which are not in the dataset(s) 
                   have -Inf as scores
       
   Usage: [rxnScores geneScores hpaScores arrayScores]=scoreModel(model,...
               hpaData,arrayData,tissue,celltype,noGeneScore,multipleGeneScoring,...
               multipleCellScoring,hpaLevelScores)

   Rasmus Agren, 2013-08-01

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function [rxnScores geneScores hpaScores arrayScores]=scoreModel(model,hpaData,arrayData,tissue,celltype,noGeneScore,multipleGeneScoring,multipleCellScoring,hpaLevelScores)
0002 % scoreRxns
0003 %   Scores the reactions and genes in a model based on expression data
0004 %   from HPA and/or gene arrays
0005 %
0006 %   model               a model structure
0007 %   hpaData             HPA data structure from parseHPA (opt if arrayData is
0008 %                       supplied, default [])
0009 %   arrayData           gene expression data structure (opt if hpaData is
0010 %                       supplied, default [])
0011 %       genes           cell array with the unique gene names
0012 %       tissues         cell array with the tissue names. The list may not be
0013 %                       unique, as there can be multiple cell types per tissue
0014 %       celltypes       cell array with the cell type names for each tissue
0015 %       levels          GENESxTISSUES array with the expression level for
0016 %                       each gene in each tissue/celltype. NaN should be
0017 %                       used when no measurement was performed
0018 %   tissue              tissue to score for. Should exist in either
0019 %                       hpaData.tissues or arrayData.tissues
0020 %   celltype            cell type to score for. Should exist in either
0021 %                       hpaData.celltypes or arrayData.celltypes for this
0022 %                       tissue (opt, default is to use the best values
0023 %                       among all the cell types for the tissue. Use [] if
0024 %                       you want to supply more arguments)
0025 %   noGeneScore         score for reactions without genes (opt, default -2)
0026 %   multipleGeneScoring determines how scores are calculated for reactions
0027 %                       with several genes ('best' or 'average')
0028 %                       (opt, default 'best')
0029 %   multipleCellScoring determines how scores are calculated when several
0030 %                       cell types are used ('best' or 'average')
0031 %                       (opt, default 'best')
0032 %   hpaLevelScores      structure with numerical scores for the expression
0033 %                       level categories from HPA. The structure should have a
0034 %                       "names" and a "scores" field (opt, see code for
0035 %                       default scores)
0036 %
0037 %   rxnScores       scores for each of the reactions in model
0038 %   geneScores      scores for each of the genes in model. Genes which are
0039 %                   not in the dataset(s) have -Inf as scores
0040 %   hpaScores       scores for each of the genes in model if only taking hpaData
0041 %                   into account. Genes which are not in the dataset(s)
0042 %                   have -Inf as scores
0043 %   arrayScores     scores for each of the genes in model if only taking arrayData
0044 %                   into account. Genes which are not in the dataset(s)
0045 %                   have -Inf as scores
0046 %
0047 %   Usage: [rxnScores geneScores hpaScores arrayScores]=scoreModel(model,...
0048 %               hpaData,arrayData,tissue,celltype,noGeneScore,multipleGeneScoring,...
0049 %               multipleCellScoring,hpaLevelScores)
0050 %
0051 %   Rasmus Agren, 2013-08-01
0052 %
0053 
0054 if nargin<3
0055     arrayData=[];
0056 end
0057 if nargin<5
0058     celltype=[];
0059 end
0060 if nargin<6
0061     noGeneScore=-2;
0062 end
0063 if nargin<7
0064     multipleGeneScoring='best';
0065 end
0066 if nargin<8
0067     multipleCellScoring='best';
0068 end
0069 if nargin<9
0070     %The first four are for APE, the other ones for staining
0071     hpaLevelScores.names={'High' 'Medium' 'Low' 'None' 'Strong' 'Moderate' 'Weak' 'Negative'};
0072     hpaLevelScores.scores=[20 15 10 -8 20 15 10 -8];
0073 end
0074 
0075 if isempty(hpaData) && isempty(arrayData)
0076     dispEM('Must supply hpaData, arrayData or both');    
0077 end
0078 if ~strcmpi(multipleGeneScoring,'best') && ~strcmpi(multipleGeneScoring,'average')
0079     dispEM('Valid options for multipleGeneScoring are "best" or "average"');    
0080 end
0081 if ~strcmpi(multipleCellScoring,'best') && ~strcmpi(multipleCellScoring,'average')
0082     dispEM('Valid options for multipleCellScoring are "best" or "average"');    
0083 end
0084 
0085 
0086 %Throw an error if array data for only one tissue is supplied
0087 if any(arrayData)
0088     if numel(arrayData.tissues)<2
0089         dispEM('arrayData must contain measurements for at least two celltypes/tissues since the score is calculated based on the expression level compared to the overall average'); 
0090     end
0091 end
0092 
0093 %This is so that the code can ignore which combination of input data that is
0094 %used
0095 if isempty(arrayData)
0096     arrayData.genes={};
0097     arrayData.tissues={};
0098     arrayData.celltypes={};
0099     arrayData.levels=[];
0100 end
0101 if isempty(hpaData)
0102     hpaData.genes={};
0103     hpaData.tissues={};
0104     hpaData.celltypes={};
0105     hpaData.levels={};
0106     hpaData.types={};
0107     hpaData.reliabilities={};
0108     hpaData.gene2Level=[];
0109     hpaData.gene2Type=[];
0110     hpaData.gene2Reliability=[];
0111 end
0112 
0113 %Check that the tissue exists
0114 if ~ismember(upper(tissue),upper(hpaData.tissues)) && ~ismember(upper(tissue),upper(arrayData.tissues))
0115     dispEM('The tissue name does not match');   
0116 end
0117 if any(celltype)
0118     %Check that both data types has cell type defined if that is to be used
0119     if ~isfield(hpaData,'celltypes') || ~isfield(arrayData,'celltypes')
0120         dispEM('Both hpaData and arrayData must contain cell type information if cell type is to be used');   
0121     end
0122     if ~ismember(upper(celltype),upper(hpaData.celltypes)) && ~ismember(upper(celltype),upper(arrayData.celltypes))
0123         dispEM('The cell type name does not match');   
0124     end
0125 end
0126 
0127 %Some preprocessing of the structures to speed up a little
0128 %Remove all tissues that are not the correct one
0129 J=~strcmpi(hpaData.tissues,tissue);
0130 
0131 %If cell type is supplied, then only keep that cell type
0132 if any(celltype)
0133     J=J | ~strcmpi(hpaData.celltypes,celltype);
0134 end
0135 
0136 hpaData.tissues(J)=[];
0137 if isfield(hpaData,'celltypes')
0138     hpaData.celltypes(J)=[];
0139 end
0140 if isfield(hpaData,'gene2Level')
0141     hpaData.gene2Level(:,J)=[];
0142 end
0143 if isfield(hpaData,'gene2Type')
0144     hpaData.gene2Type(:,J)=[];
0145 end
0146 if isfield(hpaData,'gene2Reliability')
0147     hpaData.gene2Reliability(:,J)=[];
0148 end
0149 
0150 %Remove all genes from the structures that are not in model or that aren't
0151 %measured in the tissue
0152 if ~isempty(hpaData.genes) %This should not be necessary, but the summation is a 0x1 matrix and the other is []
0153     I=~ismember(hpaData.genes,model.genes) | sum(hpaData.gene2Level,2)==0;
0154 else
0155     I=[];
0156 end
0157 hpaData.genes(I)=[];
0158 if isfield(hpaData,'gene2Level')
0159     hpaData.gene2Level(I,:)=[];
0160 end
0161 if isfield(hpaData,'gene2Type')
0162     hpaData.gene2Type(I,:)=[];
0163 end
0164 if isfield(hpaData,'gene2Reliability')
0165     hpaData.gene2Reliability(I,:)=[];
0166 end
0167 
0168 I=strcmpi(arrayData.tissues,tissue);
0169 %If cell type is supplied, then only keep that cell type
0170 if any(celltype)
0171     I=I & strcmpi(arrayData.celltypes,celltype);
0172 end
0173 
0174 %Remove all genes from the structures that are not in model or that aren't
0175 %measured in the tissue
0176 J=~ismember(arrayData.genes,model.genes) | myAll(isnan(arrayData.levels(:,I)),2);
0177 arrayData.genes(J)=[];
0178 arrayData.levels(J,:)=[];
0179 
0180 %Calculate the scores for the arrayData. These scores are
0181 %calculated for each genes from its fold change between the tissue/celltype(s) in
0182 %question and all other celltypes. This is a lower quality data than
0183 %protein abundance, since a gene that is equally highly expressed in all cell types
0184 %will have a score of 0.0. These scores are therefore only used for genes
0185 %for which there is no HPA data available. The fold changes are transformed
0186 %as min(log(x),10) for x>1 and max(log(x),-5) for x<1 in order to have
0187 %negative scores for lower expressed genes and to scale the scrores to have
0188 %somewhat lower weights than the HPA scores
0189 tempArrayLevels=arrayData.levels;
0190 tempArrayLevels(isnan(tempArrayLevels))=0;
0191 average=sum(tempArrayLevels,2)./sum(~isnan(arrayData.levels),2);
0192 if strcmpi(multipleCellScoring,'best')
0193     current=max(tempArrayLevels(:,I),[],2);
0194 else
0195     current=sum(tempArrayLevels(:,I),2)./sum(~isnan(arrayData.levels(:,I)),2);
0196 end
0197 if any(current)
0198     aScores=5*log(current./average);
0199 else
0200     aScores=[];
0201 end
0202 aScores(aScores>0)=min(aScores(aScores>0),10);
0203 aScores(aScores<0)=max(aScores(aScores<0),-5);
0204 
0205 %Map the HPA levels to scores
0206 [I J]=ismember(upper(hpaData.levels),upper(hpaLevelScores.names));
0207 if ~all(I)
0208     dispEM('There are expression level categories that do not match to hpaLevelScores');  
0209 end
0210 [K L M]=find(hpaData.gene2Level);
0211 scores=hpaLevelScores.scores(J);
0212 if strcmpi(multipleCellScoring,'best')
0213     hScores=max(sparse(K,L,scores(M),numel(hpaData.genes),numel(hpaData.tissues)),[],2);
0214 else
0215     hScores=mean(sparse(K,L,scores(M),numel(hpaData.genes),numel(hpaData.tissues)),2);
0216 end
0217 
0218 %Get the scores for the genes, only use HPA if available
0219 geneScores=inf(numel(model.genes),1)*-1;
0220 hpaScores=geneScores;
0221 arrayScores=geneScores;
0222 
0223 [I J]=ismember(model.genes,hpaData.genes);
0224 hpaScores(I)=hScores(J(I));
0225 geneScores(I)=hScores(J(I));
0226 [I J]=ismember(model.genes,arrayData.genes);
0227 arrayScores(I)=aScores(J(I));
0228 geneScores(I & myIsInf(geneScores))=aScores(J(I & myIsInf(geneScores)));
0229 
0230 %Remove the genes that have no data from the model
0231 I=ismember(model.genes,hpaData.genes) | ismember(model.genes,arrayData.genes);
0232 model.genes(~I)=[];
0233 model.rxnGeneMat(:,~I)=[];
0234 
0235 %Map the genes to the HPA/array genes
0236 [hpaExist hpaMap]=ismember(model.genes,hpaData.genes);
0237 [arrayExist arrayMap]=ismember(model.genes,arrayData.genes);
0238 
0239 %Set the default scores for reactions without genes
0240 rxnScores=ones(numel(model.rxns),1)*noGeneScore;
0241 
0242 %Loop through the reactions and calculate the scores
0243 for i=1:numel(model.rxns)
0244    %Check if it has genes
0245    I=find(model.rxnGeneMat(i,:));
0246    if any(I)
0247        %If any of the genes exist in hpaData, then don't use arrayData
0248        if any(hpaExist(I))
0249            %At least one gene was found in HPA
0250            if strcmpi(multipleGeneScoring,'best')
0251                rxnScores(i)=max(hScores(hpaMap(I(hpaExist(I)))));
0252            else
0253                rxnScores(i)=mean(hScores(hpaMap(I(hpaExist(I)))));
0254            end
0255        else
0256            %Use array data
0257            if any(arrayExist(I))
0258               %At least one gene was found in the array data
0259                if strcmpi(multipleGeneScoring,'best')
0260                    rxnScores(i)=max(aScores(arrayMap(I(arrayExist(I)))));
0261                else
0262                    rxnScores(i)=mean(aScores(arrayMap(I(arrayExist(I)))));
0263                end
0264            end
0265        end
0266    end
0267 end
0268 end
0269 
0270 %This is because isinf and all returns 0x1 for empty set, which gives a
0271 %concatenation error. Do like this instead of having many if statements
0272 function y=myIsInf(x)
0273     y=isinf(x);
0274     if isempty(y)
0275         y=[];
0276     end
0277 end
0278 function y=myAll(x,dim)
0279     y=all(x,dim);
0280     if isempty(y)
0281         y=[];
0282     end
0283 end

Generated on Mon 06-Jan-2014 14:58:12 by m2html © 2005