function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
% parseFormulas
%   Extracts the elemental composition from chemical formulas.
%
%   formulas     cell array of formula strings. A single character vector
%                is also accepted and treated as one formula
%   noPolymers   if true, a formula containing exactly one '(' and one
%                ')n' has those markers removed and is parsed as a
%                monomer. If false, such a formula is not parsed
%                (exitFlag -1). Formulas with any other combination of
%                '(' and ')n' are never parsed (opt, default false)
%   isInchi      if true, each string is split on '/' and the second
%                field is parsed as the formula (opt, default false)
%   ignoreRX     if true, the elements 'R' and 'X' are given weight 0 so
%                that they do not make MW NaN (opt, default false)
%
%   elements
%       abbrevs  abbreviations of the elements that occur in any formula
%       names    names of the same elements
%   useMat       MxN matrix with the coefficient of each used element (N)
%                in each formula (M)
%   exitFlag     Mx1 vector
%                 1: formula parsed
%                 0: empty formula
%                -1: formula could not be parsed
%   MW           Mx1 vector with the sum of coefficient times element
%                weight. NaN if the formula was not parsed or if it
%                contains an element without a defined weight. Only
%                computed when requested as fourth output
%
%   Usage: [elements, useMat, exitFlag, MW]=parseFormulas(formulas,...
%           noPolymers,isInchi,ignoreRX)

if nargin<2
    noPolymers=false;
end
if nargin<3
    isInchi=false;
end
if nargin<4
    ignoreRX=false;
end

%Accept a single formula given as a character vector
if ischar(formulas)
    formulas={formulas};
end

elements.abbrevs={'C', 'N', 'O', 'S', 'P', 'H', 'He', 'Li', 'Be', 'B', 'F', 'Ne', 'Na', 'Mg', 'Al',...
    'Si', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni',...
    'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc',...
    'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce',...
    'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',...
    'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra',...
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',...
    'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'R', 'X'}';
elements.names={'carbon', 'nitrogen', 'oxygen', 'sulfur', 'phosphorus', 'hydrogen', 'helium', 'lithium', 'beryllium', 'boron',...
    'fluorine', 'neon', 'sodium', 'magnesium', 'aluminum', 'silicon',...
    'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium',...
    'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium',...
    'arsenic', 'selenium', 'bromine', 'krypton', 'rubidium', 'strontium', 'yttrium', 'zirconium',...
    'niobium', 'molybdenum', 'technetium', 'ruthenium', 'rhodium', 'palladium', 'silver', 'cadmium',...
    'indium', 'tin', 'antimony', 'tellurium', 'iodine', 'xenon', 'cesium', 'barium', 'lanthanum',...
    'cerium', 'praseodymium', 'neodymium', 'promethium', 'samarium', 'europium', 'gadolinium',...
    'terbium', 'dysprosium', 'holmium', 'erbium', 'thulium', 'ytterbium', 'lutetium', 'hafnium',...
    'tantalum', 'tungsten', 'rhenium', 'osmium', 'iridium', 'platinum', 'gold', 'mercury',...
    'thallium', 'lead', 'bismuth', 'polonium', 'astatine', 'radon', 'francium', 'radium',...
    'actinium', 'thorium', 'protactinium', 'uranium', 'neptunium', 'plutonium', 'americium',...
    'curium', 'berkelium', 'californium', 'einsteinium', 'fermium', 'mendelevium', 'nobelium',...
    'lawrencium', 'rutherfordium', 'dubnium', 'seaborgium', 'bohrium', 'hassium', 'meitnerium',...
    'darmstadtium', 'roentgenium', 'copernicium', 'generic group', 'bound protein'}';

%Element weights in the same order as elements.abbrevs. The last three
%entries (Cn, R, X) have no defined weight
EWs=[12.0107 14.0067 15.9994 32.065 30.973762 1.00794 4.002602 6.941 9.012182 10.811 18.9984032 ...
    20.1797 22.98976928 24.305 26.9815386 28.0855 35.453 39.948 39.0983 40.078 44.955912 47.867 50.9415 ...
    51.9961 54.938045 55.845 58.933195 58.6934 63.546 65.39 69.723 72.64 74.9216 78.96 79.904 83.798 ...
    85.4678 87.62 88.90585 91.224 92.906 95.94 97.9072 101.07 102.905 106.42 107.8682 112.411 114.818 ...
    118.71 121.76 127.6 126.904 131.293 132.9054519 137.327 138.90547 140.116 140.90765 144.242 144.9127 ...
    150.36 151.964 157.25 158.92535 162.5 164.93 167.259 168.93421 173.04 174.967 178.49 180.94788 183.84 ...
    186.207 190.23 192.217 195.084 196.966569 200.59 204.3833 207.2 208.9804 208.9824 209.9871 222.0176 ...
    223.0197 226.0254 227.0277 232.03806 231.03588 238.02891 237.0482 244.0642 243.0614 247.0704 247.0703 ...
    251.0796 252.083 257.0951 258.0984 259.101 262.1097 261.1088 262 266 264 277 268 271 272 nan nan nan]';

%Let R and X count as weightless instead of making the weight undefined
if ignoreRX==true
    EWs(end-1:end)=0;
end

useMat=zeros(numel(formulas),numel(elements.abbrevs));

%Stays 0 for empty formulas
exitFlag=zeros(numel(formulas),1);

%'p+1' is rewritten to 'H+' before the charge signs are stripped, so it
%ends up being parsed as 'H'
formulas=strrep(formulas,'p+1','H+');

%Charge signs carry no element information
formulas=strrep(formulas,'+','');
formulas=strrep(formulas,'-','');

for i=1:numel(formulas)
    if ~isempty(formulas{i})
        success=false;
        formula=formulas{i};

        %For InChI strings the formula is the second '/'-separated field
        if isInchi==true
            S=regexp(formula,'/','split');
            if numel(S)>=2
                formula=S{2};
            else
                formula='';
            end
        end

        %Polymer notation is only looked for in plain formulas
        if isInchi==false
            LP=strfind(formula,'(');
            RP=strfind(formula,')n');

            if numel(LP)==1 && numel(RP)==1
                if noPolymers==true
                    %Parse the repeating unit as if it was a monomer
                    formula=strrep(formula,'(','');
                    formula=strrep(formula,')n','');
                else
                    exitFlag(i)=-1;
                    continue;
                end
            else
                %Any other use of parentheses is not supported
                if ~isempty(LP) || ~isempty(RP)
                    exitFlag(i)=-1;
                    continue;
                end
            end
        end

        %Everything except digits and '.' counts as non-numeric
        nonNumeric=false(numel(formula),1);
        nonNumeric(regexp(formula,'[^0-9.]'))=true;

        %Each upper case letter starts a new element symbol
        upperI=isstrprop(formula,'upper');
        upperX=find(upperI);
        nChars=numel(formula);

        for j=1:numel(upperX)
            pos=upperX(j);

            if pos==nChars
                %Single letter element at the very end of the formula
                coeff=1;
                element=formula(pos);
            elseif nonNumeric(pos+1)
                if upperI(pos+1)
                    %Directly followed by the next element
                    coeff=1;
                    element=formula(pos);
                else
                    %Two character element symbol
                    if j==numel(upperX)
                        if pos<nChars-1
                            coeff=str2double(formula(pos+2:end));
                        else
                            coeff=1;
                        end
                    else
                        if nonNumeric(pos+2)==true
                            coeff=1;
                        else
                            coeff=str2double(formula(pos+2:upperX(j+1)-1));
                        end
                    end
                    element=formula(pos:pos+1);
                end
            else
                %Single letter element followed by its coefficient
                if j==numel(upperX)
                    coeff=str2double(formula(pos+1:end));
                else
                    coeff=str2double(formula(pos+1:upperX(j+1)-1));
                end
                element=formula(pos);
            end

            I=strcmp(element,elements.abbrevs);
            if ~any(I) || isnan(coeff)
                %Unknown element or unreadable coefficient. Mark the whole
                %formula as failed, also if earlier elements were parsed,
                %so that no partial composition is reported as a success
                success=false;
                break;
            end
            useMat(i,I)=useMat(i,I)+coeff;
            success=true;
        end

        if success
            exitFlag(i)=1;
        else
            useMat(i,:)=0;
            exitFlag(i)=-1;
        end
    end
end

%Remove elements that are not used in any formula. The dimension must be
%given explicitly; any() would otherwise reduce a single-row useMat along
%its columns and return a scalar
I=~any(useMat,1);
useMat(:,I)=[];
elements.abbrevs(I)=[];
elements.names(I)=[];
EWs(I)=[];

if nargout>3
    known=~isnan(EWs);
    P=bsxfun(@times,useMat(:,known),EWs(known).');
    MW=sum(P,2);

    %The weight is undefined for formulas that use an element without a
    %defined weight, and for formulas that were not parsed
    MW(any(useMat(:,~known),2))=nan;
    MW(exitFlag~=1)=nan;
end
end