Home > RAVEN > constructMultiFasta.m

constructMultiFasta

PURPOSE ^

constructMultiFasta

SYNOPSIS ^

function constructMultiFasta(model,sourceFile,outputDir)

DESCRIPTION ^

 constructMultiFasta 
   Saves one file in FASTA format for each reaction in the model that has genes

   model         a model structure
   sourceFile    a file with sequences in FASTA format
   outputDir     the directory to save the resulting FASTA files in

   The source file is assumed to have the format '>gene identifier
   additional info'. Only the gene identifier is used for matching. This is
   to be compatible with the rest of the code that retrieves information
   from KEGG.

   Usage: constructMultiFasta(model,sourceFile,outputDir)

   Rasmus Agren, 2012-12-16

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function constructMultiFasta(model,sourceFile,outputDir)
0002 % constructMultiFasta
0003 %   Saves one file in FASTA format for each reaction in the model that has genes
0004 %
0005 %   model         a model structure
0006 %   sourceFile    a file with sequences in FASTA format
0007 %   outputDir     the directory to save the resulting FASTA files in
0008 %
0009 %   The source file is assumed to have the format '>gene identifier
0010 %   additional info'. Only the gene identifier is used for matching. This is
0011 %   to be compatible with the rest of the code that retrieves information
0012 %   from KEGG.
0013 %
0014 %   Usage: constructMultiFasta(model,sourceFile,outputDir)
0015 %
0016 %   Rasmus Agren, 2012-12-16
0017 %
0018 
0019 %Open the source file
0020 fid = fopen(sourceFile, 'r');
0021 
0022 %Since the list of genes will be accessed many times I use a Java hashtable
0023 hTable = java.util.Hashtable;
0024 
0025 for i=1:numel(model.genes)
0026     hTable.put(model.genes{i}, i);
0027 end
0028 
0029 %First go through the source file and save the position (in bytes) of the
0030 %start of each gene
0031 elementPositions=zeros(5000000,1,'int64'); %Make room for 5000000 elements
0032 totalElements=0;
0033 whereAmI=0; %Keeps track of where in the file we are
0034 while 1
0035     %Read 10 mb at a time
0036     str=fread(fid,10000000,'int8');
0037     
0038     %Find any '>' which indicates a new label in FASTA format
0039     newPosts=find(str==62); %62 is '>'
0040     
0041     elementPositions(totalElements+1:totalElements+numel(newPosts))=whereAmI+newPosts;
0042     
0043     totalElements=totalElements+numel(newPosts);
0044     
0045     whereAmI=whereAmI+10000000;
0046     
0047     if feof(fid)
0048         break;
0049     end
0050 end
0051 elementPositions=elementPositions(1:totalElements);
0052 fprintf('Completed scanning of source file\n');
0053 
0054 %Now loop through the file to see which genes are present in the gene list
0055 %and save their position IN elementPositions! This is to enable a easy way
0056 %to get the distance to the following element
0057 genePositions=zeros(numel(model.genes),1);
0058 for i=1:numel(elementPositions)
0059     fseek(fid,elementPositions(i),-1);
0060     str=fread(fid,[1 30],'*char'); %Assumes that no ID is longer than 20 characters
0061     delim=find(str==32 | str==10,1,'first'); %Space or line feed
0062    
0063     geneIdentifier=str(1:delim-1);
0064     
0065     %This should never happen, but just to prevent errors. Could be that
0066     %'>' is a part of some gene information. An alternative would be to
0067     %check that the indexes follows a line feed
0068     if isempty(geneIdentifier)
0069         continue; 
0070     end
0071    
0072     %If not found it means that the id was too long
0073     if isempty(delim)
0074         throw(MException('','Too long gene identifier, increase read length')); 
0075     end
0076    
0077     %See if the gene was found
0078     id=hTable.get(geneIdentifier);
0079    
0080     if any(id)
0081         if genePositions(id)==0
0082             genePositions(id)=i;
0083         end
0084     end
0085 end
0086 
0087 fprintf('Completed mapping of genes to source file\n');
0088 
0089 %Loop through the reactions and print the corresponding sequences
0090 for i=1:numel(model.rxns)
0091     %Don't overwrite existing files
0092     if ~exist(fullfile(outputDir,[model.rxns{i} '.fa']), 'file')
0093         
0094         %Get the positions in elementPositions for the involved genes
0095         genesUsed=model.rxnGeneMat(i,:);
0096         
0097         %Open a file for this reaction. This saves empty files for KOs
0098         %without genes
0099         rxnfid=fopen(fullfile(outputDir,[model.rxns{i} '.fa']),'w');
0100             
0101         if any(genesUsed)
0102             positions=genePositions(genesUsed~=0);
0103 
0104             %It could be that some genes were not found. Delete those elements
0105             positions(positions==0)=[];
0106 
0107             %Print each sequence
0108             for j=1:numel(positions)
0109                 fseek(fid,elementPositions(positions(j)),-1);
0110                 %Should check that it ends with a gene!!!!
0111                 %Check for eof
0112                 if positions(j)<numel(elementPositions)
0113                     str=fread(fid,[1 double(elementPositions(positions(j)+1))-double(elementPositions(positions(j)))-1],'*char');
0114 
0115                     %If the string doesn't end with a line feed character
0116                     if str(end)~=10
0117                         str=[str fread(fid,[1 double(elementPositions(positions(j)+2))-double(elementPositions(positions(j)+1))],'*char')];
0118 
0119                         %This is if we still haven't found a new gene.
0120                         %Maximal unluck. This whole check should be done
0121                         %when elementPositions are calculated!
0122                         if str(end)~=10
0123                             %Skip this gene
0124                             continue;
0125                         end
0126                     end
0127                 else
0128                     str=fread(fid,[1 inf],'*char');
0129                 end
0130                 fwrite(rxnfid,['>' str]);
0131             end
0132         end
0133         fclose(rxnfid);
0134     end
0135 end
0136 
0137 %Close the source file
0138 fclose(fid);
0139 end

Generated on Tue 23-Apr-2013 15:18:37 by m2html © 2005