Home > RAVEN > getBlastFromExcel.m

getBlastFromExcel

PURPOSE ^

getBlastFromExcel

SYNOPSIS ^

function blastStructure=getBlastFromExcel(models,blastFile,organismId)

DESCRIPTION ^

 getBlastFromExcel
   Retrieves gene homology information from Excel files. Used as
   input to getModelFromHomology.

   models          a cell array of model structures
   blastFile       Excel file with homology information
   organismId      the id of the organism of interest (as described in the
                   Excel file)

   blastStructure  structure containing the information in the Excel
                   sheets.

   The Excel file should contain a number of spreadsheets which in turn 
   contain the bidirectional homology measurements between the genes in the 
   organisms. The first and second column headers in each sheet are the
   "from" and "to" model ids (as defined in models or for the new organism).
   The entries should correspond to the gene names in those models. The third, 
   fourth, and fifth columns represent the E-value, alignment length, and 
   identity for each measurement (captions should be "E-value", "Alignment length",
   and "Identity").

   Usage: blastStructure=getBlastFromExcel(models,blastFile,organismId)

   Rasmus Agren, 2012-03-20

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function blastStructure=getBlastFromExcel(models,blastFile,organismId)
% getBlastFromExcel
%   Retrieves gene homology information from Excel files. Used as
%   input to getModelFromHomology.
%
%   models          a cell array of model structures (only the "id" field
%                   of each model is read here)
%   blastFile       Excel file with homology information
%   organismId      the id of the organism of interest (as described in the
%                   Excel file)
%
%   blastStructure  structure array with one element per accepted sheet
%                   and the fields
%                       toId       id of the organism in the second column
%                       fromId     id of the organism in the first column
%                       fromGenes  gene names from the first column
%                       toGenes    gene names from the second column
%                       evalue     E-values (third column)
%                       aligLen    alignment lengths (fourth column)
%                       identity   identities (fifth column)
%                   Empty ([]) if no sheet was accepted.
%
%   The Excel file should contain a number of spreadsheets which in turn
%   contain the bidirectional homology measurements between the genes in the
%   organisms. The first and second column headers in each sheet are the
%   "from" and "to" model ids (as defined in models or for the new organism).
%   The entries should correspond to the gene names in those models. The third,
%   fourth, and fifth columns represent the E-value, alignment length, and
%   identity for each measurement (captions should be "E-value", "Alignment length",
%   and "Identity").
%
%   Sheets with a malformed header, with an organism id that is neither
%   organismId nor the id of a model, or which do not involve organismId
%   are skipped with a message. Rows with an empty (or non-text) gene name
%   or with a missing/non-numeric measurement are removed. An exception is
%   thrown if the file is not an Excel spreadsheet or if every row in an
%   accepted sheet lacks a gene name.
%
%   Usage: blastStructure=getBlastFromExcel(models,blastFile,organismId)
%
%   Rasmus Agren, 2012-03-20
%

blastStructure=[];

%Get a list of organism ids. The organism of interest is always at
%position 1, followed by the model ids
organisms=cell(numel(models)+1,1);
organisms{1}=organismId;
for i=1:numel(models)
    organisms{i+1}=models{i}.id;
end

%Get all the spreadsheets in the file
[type, sheets]=xlsfinfo(blastFile);

%Check if the file is a Microsoft Excel Spreadsheet
if ~strcmp(type,'Microsoft Excel Spreadsheet')
    throw(MException('','The file is not a Microsoft Excel Spreadsheet'));
end

for i=1:numel(sheets)
    %Use the raw output rather than the numeric/text outputs. The numeric
    %output trims leading and trailing non-numeric rows/columns, so its
    %rows and columns can shift relative to the text output (for example
    %if some gene names are stored as numbers)
    [~,~,raw]=xlsread(blastFile,i);

    %Check if the sheet has the right header
    if ~hasValidHeader(raw)
        fprintf('WARNING: The data in sheet %s is not correctly formatted. Ignoring sheet\n',sheets{i});
        continue;
    end

    %Map the organism ids in the header to positions in "organisms". Only
    %the first match is used so that duplicated ids cannot result in
    %non-scalar indexes
    fromID=find(strcmpi(raw{1,1},organisms),1);
    toID=find(strcmpi(raw{1,2},organisms),1);

    if isempty(toID) || isempty(fromID)
        fprintf('The data in sheet %s has no corresponding model. Ignoring sheet\n',sheets{i});
        continue;
    end
    if toID~=1 && fromID~=1
        fprintf('The data in sheet %s does not involve the organism of interest. Ignoring sheet\n',sheets{i});
        continue;
    end

    fromGenes=raw(2:end,1);
    toGenes=raw(2:end,2);
    measurements=cellsToNumeric(raw(2:end,3:5));

    %Check that no gene ids are empty. This could for example be the case
    %if the gene names are wrongly formatted as numbers instead of strings
    emptyNames=~isGeneName(fromGenes) | ~isGeneName(toGenes);
    if any(emptyNames)
        if all(emptyNames)
            throw(MException('','Only empty gene names in sheet from %s to %s',organisms{fromID},organisms{toID}));
        else
            fprintf('WARNING: Empty gene names in sheet from %s to %s. Ignoring genes with empty names\n',organisms{fromID},organisms{toID});
        end
    end

    %Remove matches with empty gene names or where any of the values is
    %NaN. The latter would have been done anyways in getModelFromHomology,
    %but it's neater to do it here
    keep=~emptyNames & ~any(isnan(measurements),2);

    k=numel(blastStructure)+1;
    blastStructure(k).toId=organisms{toID};
    blastStructure(k).fromId=organisms{fromID};
    blastStructure(k).fromGenes=fromGenes(keep);
    blastStructure(k).toGenes=toGenes(keep);
    blastStructure(k).evalue=measurements(keep,1);
    blastStructure(k).aligLen=measurements(keep,2);
    blastStructure(k).identity=measurements(keep,3);
end

end

function ok=hasValidHeader(raw)
%True if the sheet has at least five columns and the captions of columns
%three to five are "E-value", "Alignment length" and "Identity" (case
%insensitive). strcmpi returns false for non-text cells
ok=size(raw,1)>=1 && size(raw,2)>=5 && ...
    strcmpi(raw{1,3},'E-value') && ...
    strcmpi(raw{1,4},'Alignment length') && ...
    strcmpi(raw{1,5},'Identity');
end

function valid=isGeneName(names)
%True for each cell that contains a non-empty character array
valid=false(size(names));
for k=1:numel(names)
    valid(k)=ischar(names{k}) && ~isempty(names{k});
end
end

function values=cellsToNumeric(cells)
%Converts a cell array to a numeric matrix of the same size. Cells that do
%not hold a numeric scalar (text, empty cells, logicals) become NaN
values=nan(size(cells));
for k=1:numel(cells)
    if isnumeric(cells{k}) && isscalar(cells{k})
        values(k)=cells{k};
    end
end
end

Generated on Tue 16-Jul-2013 21:50:02 by m2html © 2005