Home > RAVEN > parseFormulas.m

parseFormulas

PURPOSE ^

parseFormulas

SYNOPSIS ^

function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)

DESCRIPTION ^

 parseFormulas
   Gets the elemental composition from formulas

   formulas      a cell array with formulas
   noPolymers    assume that all polymers consist of one element.
                 Corresponds to counting everything between (...)n as
                 n being equal to one. Only one set of parentheses
                 is allowed. If this is false then polymers are returned as
                 "Could not parse formula" (opt, default false)
   isInchi       true if the formulas are in the InChI format (opt,
                 default false)
   ignoreRX      ignore R-groups and bound protein. This can be useful since they
                 are often used only as intermediates (opt, default false)

   elements
       abbrevs   cell array with abbreviations for all used elements
       names     cell array with the names for all used elements
   useMat        MxN matrix with the number of atoms for each formula (M) and each
                 element (N)
   exitFlag      array with the exit flags:
                 1=  Sucessful parsing
                 0=  No formula found
                 -1= Could not parse formula
   MW            predicted molecular weight (g/mol). This is only returned
                 for formulas which can be sucessfully parsed, and its
                 calculation doesn't affect the exitFlag variable. NaN is
                 returned if the weight couldn't be calculated
   
   Usage: [elements, useMat, exitFlag, MW]=
               parseFormulas(formulas, noPolymers,isInchi,ignoreRX)

   Rasmus Agren, 2013-12-09

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
0002 % parseFormulas
0003 %   Gets the elemental composition from formulas
0004 %
0005 %   formulas      a cell array with formulas
0006 %   noPolymers    assume that all polymers consist of one element.
0007 %                 Corresponds to counting everything between (...)n as
0008 %                 n being equal to one. Only one set of parentheses
0009 %                 is allowed. If this is false then polymers are returned as
0010 %                 "Could not parse formula" (opt, default false)
0011 %   isInchi       true if the formulas are in the InChI format (opt,
0012 %                 default false)
0013 %   ignoreRX      ignore R-groups and bound protein. This can be useful since they
0014 %                 are often used only as intermediates (opt, default false)
0015 %
0016 %   elements
0017 %       abbrevs   cell array with abbreviations for all used elements
0018 %       names     cell array with the names for all used elements
0019 %   useMat        MxN matrix with the number of atoms for each formula (M) and each
0020 %                 element (N)
0021 %   exitFlag      array with the exit flags:
0022 %                 1=  Sucessful parsing
0023 %                 0=  No formula found
0024 %                 -1= Could not parse formula
0025 %   MW            predicted molecular weight (g/mol). This is only returned
0026 %                 for formulas which can be sucessfully parsed, and its
0027 %                 calculation doesn't affect the exitFlag variable. NaN is
0028 %                 returned if the weight couldn't be calculated
0029 %
0030 %   Usage: [elements, useMat, exitFlag, MW]=
0031 %               parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
0032 %
0033 %   Rasmus Agren, 2013-12-09
0034 %
0035 
0036 if nargin<2
0037     noPolymers=false;
0038 end
0039 if nargin<3
0040     isInchi=false;
0041 end
0042 if nargin<4
0043     ignoreRX=false;
0044 end
0045 
0046 elements.abbrevs={'C', 'N', 'O', 'S', 'P', 'H', 'He', 'Li', 'Be', 'B', 'F', 'Ne', 'Na', 'Mg', 'Al',...
0047     'Si', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni',...
0048     'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc',...
0049     'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce',...
0050     'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',...
0051     'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra',...
0052     'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',...
0053     'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'R', 'X'}';
0054 elements.names={'carbon', 'nitrogen', 'oxygen', 'sulfur', 'phosphorus', 'hydrogen', 'helium', 'lithium', 'beryllium', 'boron',...
0055     'fluorine', 'neon', 'sodium', 'magnesium', 'aluminum,', 'silicon',...
0056     'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium',...
0057     'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium',...
0058     'arsenic', 'selenium', 'bromine', 'krypton', 'rubidium', 'strontium', 'yttrium', 'zirconium',...
0059     'niobium', 'molybdenum', 'technetium', 'ruthenium', 'rhodium', 'palladium', 'silver', 'cadmium',...
0060     'indium', 'tin', 'antimony', 'tellurium', 'iodine', 'xenon', 'cesium', 'barium', 'lanthanum',...
0061     'cerium', 'praseodymium', 'neodymium', 'promethium', 'samarium', 'europium', 'gadolinium',...
0062     'terbium', 'dysprosium', 'holmium', 'erbium', 'thulium', 'ytterbium', 'lutetium', 'hafnium',...
0063     'tantalum', 'tungsten', 'rhenium', 'osmium', 'iridium', 'platinum', 'gold', 'mercury',...
0064     'thallium', 'lead', 'bismuth', 'polonium', 'astatine', 'radon', 'francium', 'radium',...
0065     'actinium', 'thorium', 'protactinium', 'uranium', 'neptunium', 'plutonium', 'americium',...
0066     'curium', 'berkelium', 'californium', 'einsteinium', 'fermium', 'mendelevium', 'nobelium',...
0067     'lawrencium', 'rutherfordium', 'dubnium', 'seaborgium', 'bohrium', 'hassium', 'meitnerium',...
0068     'darmstadtium', 'roentgenium', 'copernicium', 'generic group', 'bound protein'}';
0069 
0070 EWs=[12.0107 14.0067 15.9994 32.065 30.973762 1.00794 4.002602 6.941 9.012182 10.811 18.9984032 ...
0071 20.1797 22.98976928 24.305 26.9815386 28.0855 35.453 39.948 39.0983 40.078 44.955912 47.867 50.9415 ...
0072 51.9961 54.938045 55.845 58.933195 58.6934 63.546 65.39 69.723 72.64 74.9216 78.96 79.904 83.798 ...
0073 85.4678 87.62 88.90585 91.224 92.906 95.94 97.9072 101.07 102.905 106.42 107.8682 112.411 114.818 ...
0074 118.71 121.76 127.6 126.904 131.293 132.9054519 137.327 138.90547 140.116 140.90765 144.242 144.9127 ...
0075 150.36 151.964 157.25 158.92535 162.5 164.93 167.259 168.93421 173.04 174.967 178.49 180.94788 183.84 ...
0076 186.207 190.23 192.217 195.084 196.966569 200.59 204.3833 207.2 208.9804 208.9824 209.9871 222.0176 ...
0077 223.0197 226.0254 227.0277 232.03806 231.03588 238.02891 237.0482 244.0642 243.0614 247.0704 247.0703 ...
0078 251.0796 252.083 257.0951 258.0984 259.101 262.1097 261.1088 262 266 264 277 268 271 272 nan nan nan]';
0079 
0080 %Set the EWs of these groups to 0
0081 if ignoreRX==true
0082     EWs(end-1:end)=0;
0083 end
0084 
0085 useMat=zeros(numel(formulas),numel(elements.abbrevs));
0086 
0087 exitFlag=zeros(numel(formulas),1);
0088 
0089 %"H+", if parsed from InChI code, would have the composition "p+1". Change
0090 %to fit with how the rest of the compounds are written
0091 formulas=strrep(formulas,'p+1','H+');
0092 
0093 %Ignore charge to make the parsing easier
0094 formulas=strrep(formulas,'+','');
0095 formulas=strrep(formulas,'-','');
0096 
0097 %Loop through each formula
0098 for i=1:numel(formulas)
0099     if ~isempty(formulas{i})
0100         sucess=false; %To see if it works
0101         formula=formulas{i};
0102         
0103         %If it's an InChI code. The composition is found between the first and the second "/"
0104         %For some simple molecules such as salts only the first "/" is present
0105         if isInchi==true
0106             S=regexp(formula,'/','split');
0107             if numel(S)>=2
0108                 formula=S{2};
0109             else
0110                 formula='';
0111             end
0112         end
0113         %Only look at what's between the parantheses (polymers are not
0114         %supported in InChI)
0115         if isInchi==false
0116            LP=strfind(formula,'(');
0117            RP=strfind(formula,')n');
0118 %            if numel(LP)>1 || numel(RP)>1
0119 %               exitFlag(i)=-1;
0120 %               continue;
0121 %            end
0122 %            if numel(LP)>1 || numel(RP)>1
0123 %               exitFlag(i)=-1;
0124 %               continue;
0125 %            end
0126            if numel(LP)==1 && numel(RP)==1
0127                %This means that the polymer should be regarded as a monomer
0128                if noPolymers==true
0129                     %This means that there are one set of parantheses
0130                     formula=strrep(formula,'(','');
0131                     formula=strrep(formula,')n','');
0132                else
0133                     %This means that polymers should be ignored
0134                     exitFlag(i)=-1;
0135                     continue;
0136                end
0137            else
0138                if ~isempty(LP) || ~isempty(RP)
0139                     exitFlag(i)=-1;
0140                     continue;
0141                end
0142            end
0143         end
0144         
0145         %Get the indexes of the numeric (or ".") characters
0146         nonNumeric=false(numel(formula),1);
0147         nonNumeric(regexp(formula,'[^0-9.]'))=true;
0148         
0149         %Get the indexes of the upper characters (since each element starts
0150         %with an upper character)
0151         upperI=isstrprop(formula,'upper');
0152         upperX=find(upperI);
0153         
0154         for j=1:numel(upperX)
0155             %The first case is when it's the last character. Then the
0156             %coefficient must be 1
0157             isLast=false;
0158             if upperX(j)==numel(formula)
0159                coeff=1;
0160                element=formula(upperX(j));
0161                isLast=true;
0162             end
0163             
0164             if isLast==false
0165                 %The second case is when the following character is a character
0166                 if nonNumeric(upperX(j)+1)
0167                     %Is it a new element?
0168                     if upperI(upperX(j)+1)
0169                        %New element, that means that the coefficient was 1 and that
0170                        %the element was only one character
0171                        coeff=1;
0172                        element=formula(upperX(j));
0173                     else
0174                        %This means that it's an element with two characters
0175                        if j==numel(upperX)
0176                             if upperX(j)<numel(formula)-1
0177                                 coeff=str2double(formula(upperX(j)+2:end));
0178                             else
0179                                 coeff=1;
0180                             end
0181                        else
0182                             %Check if there is a number or a new element
0183                             %after it
0184                             if nonNumeric(upperX(j)+2)==true
0185                                 coeff=1;
0186                             else
0187                                 coeff=str2double(formula(upperX(j)+2:upperX(j+1)-1));
0188                             end
0189                        end
0190                        element=formula(upperX(j):upperX(j)+1);
0191                     end
0192                 else
0193                     %Then it is a numeral
0194                     if j==numel(upperX)
0195                         coeff=str2double(formula(upperX(j)+1:end));
0196                     else
0197                         coeff=str2double(formula(upperX(j)+1:upperX(j+1)-1));
0198                     end
0199                     element=formula(upperX(j));
0200                 end
0201             end
0202             
0203             %Find the element
0204             I=strcmp(element,elements.abbrevs);
0205             if any(I)
0206                 if ~isnan(coeff)
0207                     useMat(i,I)=useMat(i,I)+coeff;
0208                     sucess=true;
0209                 else
0210                     break;
0211                 end
0212             else
0213                 break;
0214             end
0215         end
0216         if sucess==false
0217             useMat(i,:)=0; %Reset for this formula
0218             exitFlag(i)=-1;
0219         else
0220             exitFlag(i)=1;
0221         end
0222     end
0223 end
0224 
0225 %Remove the elements which are never used
0226 I=~any(useMat);
0227 useMat(:,I)=[];
0228 elements.abbrevs(I)=[];
0229 elements.names(I)=[];
0230 EWs(I)=[];
0231 
0232 %Calcluate the weight contribution of each element in each formula. Note
0233 %that this will give NaNs for all formulas if R or X groups are in EWs,
0234 %since 0*NaN is still NaN. Therefore only use elements with known mass
0235 if nargout>3
0236     P=bsxfun(@times,useMat(:,~isnan(EWs)),EWs(~isnan(EWs)).');
0237     MW=sum(P,2);
0238 
0239     %Then remove the calculations for elements with unknown mass
0240     [I crap]=find(useMat(:,isnan(EWs)));
0241     MW(I)=nan;
0242     MW(exitFlag~=1)=nan;
0243 end
0244 end

Generated on Mon 06-Jan-2014 14:58:12 by m2html © 2005