Home > RAVEN > parseFormulas.m

parseFormulas

PURPOSE ^

parseFormulas

SYNOPSIS ^

function [nC, nP, nS, nN, nR, nO, nH, exitFlag]=parseFormulas(formulas, noPolymers,isInchi)

DESCRIPTION ^

 parseFormulas
   Gets the elemental composition from formulas

   formulas      a cell array with formulas
   noPolymers    assume that all polymers consist of one element.
                 Corresponds to counting everything between (...)n as
                 n being equal to one. Only one set of parentheses
                 is allowed (opt, default false)
   isInchi       true if the formulas are in the InChI format (opt,
                 default false)

   nC            Array with the carbon coefficients
   nP            Array with the phosphorus coefficients
   nS            Array with the sulfur coefficients
   nN            Array with the nitrogen coefficients
   nR            Array with the "general group" coefficients
   nO            Array with the oxygen coefficients
   nH            Array with the hydrogen coefficients
   exitFlag      Array with the exit flags:
                 1=  Successful parsing
                 0=  No formula found
                 -1= Could not parse formula

   Returns vectors corresponding to the coefficients for carbon, phosphorus,
   sulfur, nitrogen, "R-groups", oxygen, and hydrogen. All other elements
   are ignored.

   NOTE: Formulas which contain any two-character compounds (Zn, Fe and so on)
         might cause problems.

   Usage: [nC, nP, nS, nN, nR, nO, nH, exitFlag]=parseFormulas(formulas,
           noPolymers,isInchi)

   Rasmus Agren, 2012-12-18

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function [nC, nP, nS, nN, nR, nO, nH, exitFlag]=parseFormulas(formulas, noPolymers,isInchi)
% parseFormulas
%   Gets the elemental composition from formulas
%
%   formulas      a cell array with formulas
%   noPolymers    assume that all polymers consist of one element.
%                 Corresponds to counting everything between (...)n as
%                 n being equal to one. Only one set of parentheses
%                 is allowed (opt, default false)
%   isInchi       true if the formulas are in the InChI format (opt,
%                 default false)
%
%   nC            Array with the carbon coefficients
%   nP            Array with the phosphorus coefficients
%   nS            Array with the sulfur coefficients
%   nN            Array with the nitrogen coefficients
%   nR            Array with the "general group" coefficients
%   nO            Array with the oxygen coefficients
%   nH            Array with the hydrogen coefficients
%   exitFlag      Array with the exit flags:
%                 1=  Successful parsing
%                 0=  No formula found
%                 -1= Could not parse formula
%
%   Returns column vectors (one row per formula) with the coefficients for
%   carbon, phosphorus, sulfur, nitrogen, "R-groups", oxygen, and hydrogen.
%   All other elements are ignored. When a formula cannot be parsed all of
%   its coefficients are returned as zero.
%
%   NOTE: Only the one-character symbols C, P, S, N, R, O and H are counted.
%         An upper-case letter which is directly followed by a lower-case
%         letter is treated as the start of a two-character symbol (Zn, Fe,
%         Cl, Na and so on) and is ignored.
%
%   Usage: [nC, nP, nS, nN, nR, nO, nH, exitFlag]=parseFormulas(formulas,
%           noPolymers,isInchi)
%
%   Rasmus Agren, 2012-12-18
%

if nargin<2
    noPolymers=false;
end
if nargin<3
    isInchi=false;
end

%Only one-character abbreviations are counted. The column order here
%defines the order of the outputs (see the bottom of the function)
abbrevs=['C';'P';'S';'N';'R';'O';'H'];

coeffs=zeros(numel(formulas),numel(abbrevs));
exitFlag=zeros(numel(formulas),1);

%Loop through all of the formulas
for i=1:numel(formulas)
    foundError=false;

    %If it's empty, leave exitFlag at 0 and go to the next formula
    if isempty(formulas{i})
        continue;
    end

    if isInchi==false
        %Check if it's a polymer (has "(" and ")n")
        startPol=strfind(formulas{i},'(');
        endPol=strfind(formulas{i},')n');

        if isempty(startPol) && isempty(endPol)
            splitComp=formulas(i);
        else
            %There must be only one set of parentheses and noPolymers must
            %be true
            if numel(startPol)==1 && numel(endPol)==1 && startPol(1) < endPol(1)...
                    && noPolymers==true
                %Split into the parts before, inside and after "(...)n".
                %Each part is parsed separately and the coefficients are
                %summed, which corresponds to n=1
                splitComp={formulas{i}(1:startPol-1),...
                    formulas{i}(startPol+1:endPol-1),...
                    formulas{i}(endPol+2:numel(formulas{i}))};
            else
                exitFlag(i)=-1;
                continue;
            end
        end
    else
        %If it's an InChI code. Polymers are not supported in this format,
        %so no checks have to be made.
        %The composition is found between the first and the second "/"
        S=regexp(formulas{i},'/','split');
        if numel(S)>2
            splitComp=S(2);
        else
            splitComp={''};
            if numel(S)==2
                %This can happen for simple ions. Normally this doesn't
                %have to be dealt with in any specific way, but one
                %important exception is the proton, which is stored as
                %"p+1" rather than as "H+" which would fit with the rest of
                %the parsing
                if strcmpi(S{2},'p+1')
                    splitComp={'H+'};
                end
            end
        end
    end

    %Loop through the parts and sum the coefficients
    for j=1:numel(splitComp)
        part=splitComp{j};
        if isempty(part)
            continue;
        end

        %Get the indexes of all characters which cannot be part of a
        %coefficient (everything except digits and ".")
        nonNumeric=regexp(part,'[^0-9.]');

        %Loop through each element
        for k=1:numel(abbrevs)
            index=strfind(part,abbrevs(k));

            %Discard matches which are directly followed by a lower-case
            %letter. They are the first letter of a two-character symbol
            %(e.g. the "C" in "Cl") and not the element itself
            if ~isempty(index)
                hasNext=index<numel(part);
                isTwoChar=false(size(index));
                nextChars=part(index(hasNext)+1);
                isTwoChar(hasNext)=nextChars>='a' & nextChars<='z';
                index=index(~isTwoChar);
            end

            if isempty(index)
                continue;
            end

            %Can't proceed if the element occurs more than once
            if numel(index)>1
                foundError=true;
                break;
            end

            %The coefficient runs from the character after the element up
            %to the character before the next non-numeric one (or to the
            %end of the string)
            endIndex=find(nonNumeric>index,1);
            if ~isempty(endIndex)
                endChar=nonNumeric(endIndex)-1;
            else
                endChar=numel(part);
            end

            if endChar==index
                %Assume that the coefficient is 1 if the element is
                %directly followed by a non-numeric character or by the end
                %of the string
                coeffs(i,k)=coeffs(i,k)+1;
            else
                %str2double is used rather than str2num since it doesn't
                %evaluate the text and always returns a scalar; NaN means
                %that the text (e.g. "1.2.3" or ".") is not a number
                co=str2double(part(index+1:endChar));
                if isnan(co)
                    foundError=true;
                    break;
                end
                coeffs(i,k)=coeffs(i,k)+co;
            end
        end

        if foundError==true
            break;
        end
    end

    if foundError==true
        %Discard any partial counts for a formula which could not be parsed
        exitFlag(i)=-1;
        coeffs(i,:)=0;
    else
        exitFlag(i)=1;
    end
end

%The columns follow the order of abbrevs
nC=coeffs(:,1);
nP=coeffs(:,2);
nS=coeffs(:,3);
nN=coeffs(:,4);
nR=coeffs(:,5);
nO=coeffs(:,6);
nH=coeffs(:,7);
end

Generated on Tue 23-Apr-2013 15:18:37 by m2html © 2005