correldata
Read/write vectors of correlated data from/to a csv file.
These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
1""" 2Read/write vectors of correlated data from/to a csv file. 3 4These data are stored in a dictionary, whose values are numpy arrays 5with elements which may be strings, floats, or floats with associated uncertainties 6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library. 7""" 8 9 10__author__ = 'Mathieu Daëron' 11__contact__ = 'mathieu@daeron.fr' 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron' 13__license__ = 'MIT License - https://opensource.org/licenses/MIT' 14__date__ = '2024-10-13' 15__version__ = '1.2.3' 16 17 18import os as _os 19import numpy as _np 20import uncertainties as _uc 21 22from typing import Callable, Hashable, Any 23 24class uarray(_np.ndarray): 25 26 __doc__ = """ 27 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 28 of [ufloat](https://pypi.org/project/uncertainties) values 29 """ 30 31 def __new__(cls, a): 32 obj = _np.asarray(a).view(cls) 33 return obj 34 35 n = property(fget = _np.vectorize(lambda x : x.n)) 36 """Return the array of nominal values (read-only).""" 37 38 s = property(fget = _np.vectorize(lambda x : x.s)) 39 """Return the array of standard errors (read-only)""" 40 41 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x))) 42 """Return the correlation matrix of the array elements (read-only)""" 43 44 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x))) 45 """Return the covariance matrix of the array elements (read-only)""" 46 47 nv = n 48 "Alias for `uarray.nv`" 49 50 se = s 51 "Alias for `uarray.s`" 52 53 cor = correl 54 "Alias for `uarray.correl`" 55 56 cov = covar 57 "Alias for `uarray.covar`" 58 59 60def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 61 ''' 62 Test whether 2-D array `M` is symmetric and positive semidefinite. 63 ''' 64 ev = _np.linalg.eigvals(M) 65 return ( 66 _np.allclose(M, M.T) # M is symmetric 67 and _np.all( 68 (ev > 0) | _np.isclose(ev, 0) 69 ) # all eignevalues are either real and strictly positive or close to zero 70 ) 71 72 73def smart_type(x: str): 74 ''' 75 Tries to convert string `x` to a float if it includes a decimal point, or 76 to an integer if it does not. If both attempts fail, return the original 77 string unchanged. 78 ''' 79 try: 80 y = float(x) 81 except ValueError: 82 return x 83 if y % 1 == 0 and '.' not in x: 84 return int(y) 85 return y 86 87 88def read_data(data: str, sep: str = ',', validate_covar: bool = True): 89 ''' 90 Read correlated data from a CSV-like string. 91 92 Column names are interpreted in the following way: 93 * In most cases, each columns is converted to a dict value, with the corresponding 94 dict key being the column's label. 95 * Columns whose label starts with `SE` are interpreted as specifying the standard 96 error for the latest preceding data column. 97 * Columns whose label starts with `correl` are interpreted as specifying the 98 correlation matrix for the latest preceding data column. In that case, column labels 99 are ignored for the rest of the columns belonging to this matrix. 100 * Columns whose label starts with `covar` are interpreted as specifying the 101 covariance matrix for the latest preceding data column. In that case, column labels 102 are ignored for the rest of the columns belonging to this matrix. 103 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 104 the latest preceding data column, by adding an underscore followed by the variable's 105 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 106 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 107 underscore followed by the two variable labels, joined by a second underscore 108 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 109 correspond, respectively, to the lines and columns of this matrix. 110 * Exceptions will be raised, for any given variable: 111 - when specifying both `covar` and any combination of (`SE`, `correl`) 112 - when specifying `correl` without `SE` 113 114 **Arguments** 115 - `data`: a CSV-like string 116 - `sep`: the CSV separator 117 - `validate_covar`: whether to check that the overall covariance matrix 118 is symmetric and positive semidefinite. Specifying `validate_covar = False` 119 bypasses this computationally expensive step. 120 121 **Example** 122 ```py 123 import correldata 124 data = """ 125 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 126 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 127 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 128 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 129 """[1:-1] 130 print(correldata.read_data(data)) 131 132 # yields: 133 # 134 # > { 135 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 136 # 'Tacid': array([90., 90., 90.]), 137 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 138 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 139 # } 140 ``` 141 ''' 142 143 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 144 N = len(data) - 1 145 146 values, se, correl, covar = {}, {}, {}, {} 147 j = 0 148 while j < len(data[0]): 149 field = data[0][j] 150 if not ( 151 field.startswith('SE_') 152 or field.startswith('correl_') 153 or field.startswith('covar_') 154 or field == 'SE' 155 or field == 'correl' 156 or field == 'covar' 157 or len(field) == 0 158 ): 159 values[field] = _np.array([l[j] for l in data[1:]]) 160 j += 1 161 oldfield = field 162 elif field.startswith('SE_'): 163 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 164 j += 1 165 elif field == 'SE': 166 se[oldfield] = _np.array([l[j] for l in data[1:]]) 167 j += 1 168 elif field.startswith('correl_'): 169 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 170 j += N 171 elif field == 'correl': 172 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 173 j += N 174 elif field.startswith('covar_'): 175 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 176 j += N 177 elif field == 'covar': 178 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 179 j += N 180 181 nakedvalues = {} 182 for k in [_ for _ in values]: 183 if ( 184 k not in se 185 and k not in correl 186 and k not in covar 187 ): 188 nakedvalues[k] = values.pop(k) 189 190 for x in values: 191 if x in covar: 192 if x in se: 193 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 194 if x in correl: 195 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 196 if x in correl: 197 if x not in se: 198 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 199 200 for x in correl: 201 if x in values: 202 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 203 else: 204 for x1 in values: 205 for x2 in values: 206 if x == f'{x1}_{x2}': 207 if x1 in se: 208 se1 = se[x1] 209 else: 210 if x1 in covar: 211 se1 = _np.diag(covar[x1])**0.5 212 else: 213 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 214 if x2 in se: 215 se2 = se[x2] 216 else: 217 if x2 in covar: 218 se2 = _np.diag(covar[x2])**0.5 219 else: 220 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 221 222 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 223 224 for x in se: 225 if x in values and x not in correl: 226 covar[x] = _np.diag(se[x]**2) 227 228 for k in [_ for _ in covar]: 229 if k not in values: 230 for j1 in values: 231 for j2 in values: 232 if k == f'{j1}_{j2}': 233 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 234 235 X = _np.array([_ for k in values for _ in values[k]]) 236 CM = _np.zeros((X.size, X.size)) 237 for i, vi in enumerate(values): 238 for j, vj in enumerate(values): 239 if vi == vj: 240 if vi in covar: 241 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 242 else: 243 if f'{vi}_{vj}' in covar: 244 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 245 246 s = _np.diag(CM)**.5 247 s[s==0] = 1. 248 invs = _np.diag(s**-1) 249 250 if ( 251 validate_covar 252 and not ( 253 is_symmetric_positive_semidefinite(CM) 254 or is_symmetric_positive_semidefinite(invs @ CM @ invs) 255 ) 256 ): 257 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 258 259 corvalues = uarray(_uc.correlated_values(X, CM)) 260 261 allvalues = nakedvalues 262 263 for i, x in enumerate(values): 264 allvalues[x] = corvalues[i*N:i*N+N] 265 266 return allvalues 267 268 269def read_data_from_file(filename: str | _os.PathLike, **kwargs): 270 ''' 271 Read correlated data from a CSV file. 272 273 **Arguments** 274 - `filename`: `str` or path to the file to read from 275 - `kwargs`: passed to correldata.read_data() 276 ''' 277 with open(filename) as fid: 278 return read_data(fid.read(), **kwargs) 279 280 281def f2s( 282 x: Any, 283 f: (str | Callable | dict), 284 k: Hashable = None, 285 fb: (str | Callable) = 'z.6g', 286) -> str: 287 ''' 288 Format `x` according to format `f` 289 290 * If `f` is a string, return `f'{x:{f}}'` 291 * If `f` is a callable, return `f(x)` 292 * If `f` is a dict and optional argument `k` is a hashable, 293 return f2s(x, f[k]), otherwise return f2s(x, fb) 294 ''' 295 296 if isinstance (x, str): 297 return x 298 if isinstance (f, str): 299 return f'{x:{f}}' 300 if isinstance (f, Callable): 301 return f(x) 302 if isinstance (f, dict): 303 if k in f: 304 return f2s(x, f[k]) 305 if isinstance (fb, str): 306 return f'{x:{fb}}' 307 if isinstance (fb, Callable): 308 return fb(x) 309 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.') 310 311 312 313def data_string( 314 data: dict, 315 sep: str = ',', 316 include_fields: list = None, 317 exclude_fields: list = [], 318 float_format: (str | dict | Callable) = 'z.6g', 319 correl_format: (str | dict | Callable) = 'z.6f', 320 default_float_format: (str | Callable) = 'z.6g', 321 default_correl_format: (str | Callable) = 'z.6f', 322 align: str = '>', 323 atol: float = 1e-12, 324 rtol: float = 1e-12, 325): 326 ''' 327 Generate CSV-like string from correlated data 328 329 **Arguments** 330 - `data`: dict of arrays with strings, floats or correlated data 331 - `sep`: the CSV separator 332 - `include_fields`: subset of fields to write; if `None`, write all fields 333 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 334 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 335 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 336 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 337 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 338 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 339 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 340 missing from `float_format.keys()` will use `default_float_format` instead. 341 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 342 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 343 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 344 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 345 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 346 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 347 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 348 349 350 **Example** 351 352 ```py 353 from correldata import _uc 354 from correldata import _np 355 from correldata import * 356 357 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 358 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 359 360 data = dict(X=X, Y=Y, Z=X+Y) 361 362 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 363 364 # yields: 365 # 366 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 367 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 368 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 369 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 370 ``` 371 ''' 372 if include_fields is None: 373 include_fields = [_ for _ in data] 374 cols, ufields = [], [] 375 for f in include_fields: 376 if f in exclude_fields: 377 continue 378 if isinstance(data[f], uarray): 379 ufields.append(f) 380 N = data[f].size 381 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 382 if f'SE_{f}' not in exclude_fields: 383 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 384 if f'correl_{f}' not in exclude_fields: 385 CM = _uc.correlation_matrix(data[f]) 386 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 387 for i in range(N): 388 cols.append( 389 ['' if i else f'correl_{f}'] 390 + [ 391 f2s( 392 CM[i,j], 393 correl_format, 394 f, 395 default_correl_format, 396 ) 397 for j in range(N) 398 ] 399 ) 400 401 else: 402 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 403 404 for i in range(len(ufields)): 405 for j in range(i): 406 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 407 continue 408 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 409 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 410 for k in range(N): 411 cols.append( 412 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 413 + [ 414 f2s( 415 CM[k,l], 416 correl_format, 417 f, 418 default_correl_format, 419 ) 420 for l in range(N) 421 ] 422 ) 423 424 lines = list(map(list, zip(*cols))) 425 426 if align: 427 lengths = [max([len(e) for e in l]) for l in cols] 428 for l in lines: 429 for k,ln in enumerate(lengths): 430 l[k] = f'{l[k]:{align}{ln}s}' 431 return '\n'.join([(sep+' ').join(l) for l in lines]) 432 433 return '\n'.join([sep.join(l) for l in lines]) 434 435 436 437def save_data_to_file(data, filename, **kwargs): 438 ''' 439 Write correlated data to a CSV file. 440 441 **Arguments** 442 - `data`: dict of arrays with strings, floats or correlated data 443 - `filename`: `str` or path to the file to read from 444 - `kwargs`: passed to correldata.data_string() 445 ''' 446 with open(filename, 'w') as fid: 447 return fid.write(data_string(data, **kwargs))
25class uarray(_np.ndarray): 26 27 __doc__ = """ 28 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 29 of [ufloat](https://pypi.org/project/uncertainties) values 30 """ 31 32 def __new__(cls, a): 33 obj = _np.asarray(a).view(cls) 34 return obj 35 36 n = property(fget = _np.vectorize(lambda x : x.n)) 37 """Return the array of nominal values (read-only).""" 38 39 s = property(fget = _np.vectorize(lambda x : x.s)) 40 """Return the array of standard errors (read-only)""" 41 42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x))) 43 """Return the correlation matrix of the array elements (read-only)""" 44 45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x))) 46 """Return the covariance matrix of the array elements (read-only)""" 47 48 nv = n 49 "Alias for `uarray.nv`" 50 51 se = s 52 "Alias for `uarray.s`" 53 54 cor = correl 55 "Alias for `uarray.correl`" 56 57 cov = covar 58 "Alias for `uarray.covar`"
42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
Return the correlation matrix of the array elements (read-only)
45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
Return the covariance matrix of the array elements (read-only)
42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
Alias for uarray.correl
45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
Alias for uarray.covar
Inherited Members
- numpy.ndarray
- dumps
- dump
- all
- any
- argmax
- argmin
- argpartition
- argsort
- astype
- byteswap
- choose
- clip
- compress
- conj
- conjugate
- copy
- cumprod
- cumsum
- diagonal
- dot
- fill
- flatten
- getfield
- item
- max
- mean
- min
- nonzero
- partition
- prod
- put
- ravel
- repeat
- reshape
- resize
- round
- searchsorted
- setfield
- setflags
- sort
- squeeze
- std
- sum
- swapaxes
- take
- tobytes
- tofile
- tolist
- tostring
- trace
- transpose
- var
- view
- to_device
- ndim
- flags
- shape
- strides
- data
- itemsize
- size
- nbytes
- base
- dtype
- real
- imag
- flat
- ctypes
- T
- mT
- ptp
- newbyteorder
- itemset
- device
61def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 62 ''' 63 Test whether 2-D array `M` is symmetric and positive semidefinite. 64 ''' 65 ev = _np.linalg.eigvals(M) 66 return ( 67 _np.allclose(M, M.T) # M is symmetric 68 and _np.all( 69 (ev > 0) | _np.isclose(ev, 0) 70 ) # all eignevalues are either real and strictly positive or close to zero 71 )
Test whether 2-D array M is symmetric and positive semidefinite.
74def smart_type(x: str): 75 ''' 76 Tries to convert string `x` to a float if it includes a decimal point, or 77 to an integer if it does not. If both attempts fail, return the original 78 string unchanged. 79 ''' 80 try: 81 y = float(x) 82 except ValueError: 83 return x 84 if y % 1 == 0 and '.' not in x: 85 return int(y) 86 return y
Tries to convert string x to a float if it includes a decimal point, or
to an integer if it does not. If both attempts fail, return the original
string unchanged.
89def read_data(data: str, sep: str = ',', validate_covar: bool = True): 90 ''' 91 Read correlated data from a CSV-like string. 92 93 Column names are interpreted in the following way: 94 * In most cases, each columns is converted to a dict value, with the corresponding 95 dict key being the column's label. 96 * Columns whose label starts with `SE` are interpreted as specifying the standard 97 error for the latest preceding data column. 98 * Columns whose label starts with `correl` are interpreted as specifying the 99 correlation matrix for the latest preceding data column. In that case, column labels 100 are ignored for the rest of the columns belonging to this matrix. 101 * Columns whose label starts with `covar` are interpreted as specifying the 102 covariance matrix for the latest preceding data column. In that case, column labels 103 are ignored for the rest of the columns belonging to this matrix. 104 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 105 the latest preceding data column, by adding an underscore followed by the variable's 106 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 107 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 108 underscore followed by the two variable labels, joined by a second underscore 109 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 110 correspond, respectively, to the lines and columns of this matrix. 111 * Exceptions will be raised, for any given variable: 112 - when specifying both `covar` and any combination of (`SE`, `correl`) 113 - when specifying `correl` without `SE` 114 115 **Arguments** 116 - `data`: a CSV-like string 117 - `sep`: the CSV separator 118 - `validate_covar`: whether to check that the overall covariance matrix 119 is symmetric and positive semidefinite. Specifying `validate_covar = False` 120 bypasses this computationally expensive step. 121 122 **Example** 123 ```py 124 import correldata 125 data = """ 126 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 127 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 128 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 129 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 130 """[1:-1] 131 print(correldata.read_data(data)) 132 133 # yields: 134 # 135 # > { 136 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 137 # 'Tacid': array([90., 90., 90.]), 138 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 139 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 140 # } 141 ``` 142 ''' 143 144 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 145 N = len(data) - 1 146 147 values, se, correl, covar = {}, {}, {}, {} 148 j = 0 149 while j < len(data[0]): 150 field = data[0][j] 151 if not ( 152 field.startswith('SE_') 153 or field.startswith('correl_') 154 or field.startswith('covar_') 155 or field == 'SE' 156 or field == 'correl' 157 or field == 'covar' 158 or len(field) == 0 159 ): 160 values[field] = _np.array([l[j] for l in data[1:]]) 161 j += 1 162 oldfield = field 163 elif field.startswith('SE_'): 164 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 165 j += 1 166 elif field == 'SE': 167 se[oldfield] = _np.array([l[j] for l in data[1:]]) 168 j += 1 169 elif field.startswith('correl_'): 170 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 171 j += N 172 elif field == 'correl': 173 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 174 j += N 175 elif field.startswith('covar_'): 176 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 177 j += N 178 elif field == 'covar': 179 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 180 j += N 181 182 nakedvalues = {} 183 for k in [_ for _ in values]: 184 if ( 185 k not in se 186 and k not in correl 187 and k not in covar 188 ): 189 nakedvalues[k] = values.pop(k) 190 191 for x in values: 192 if x in covar: 193 if x in se: 194 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 195 if x in correl: 196 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 197 if x in correl: 198 if x not in se: 199 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 200 201 for x in correl: 202 if x in values: 203 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 204 else: 205 for x1 in values: 206 for x2 in values: 207 if x == f'{x1}_{x2}': 208 if x1 in se: 209 se1 = se[x1] 210 else: 211 if x1 in covar: 212 se1 = _np.diag(covar[x1])**0.5 213 else: 214 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 215 if x2 in se: 216 se2 = se[x2] 217 else: 218 if x2 in covar: 219 se2 = _np.diag(covar[x2])**0.5 220 else: 221 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 222 223 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 224 225 for x in se: 226 if x in values and x not in correl: 227 covar[x] = _np.diag(se[x]**2) 228 229 for k in [_ for _ in covar]: 230 if k not in values: 231 for j1 in values: 232 for j2 in values: 233 if k == f'{j1}_{j2}': 234 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 235 236 X = _np.array([_ for k in values for _ in values[k]]) 237 CM = _np.zeros((X.size, X.size)) 238 for i, vi in enumerate(values): 239 for j, vj in enumerate(values): 240 if vi == vj: 241 if vi in covar: 242 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 243 else: 244 if f'{vi}_{vj}' in covar: 245 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 246 247 s = _np.diag(CM)**.5 248 s[s==0] = 1. 249 invs = _np.diag(s**-1) 250 251 if ( 252 validate_covar 253 and not ( 254 is_symmetric_positive_semidefinite(CM) 255 or is_symmetric_positive_semidefinite(invs @ CM @ invs) 256 ) 257 ): 258 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 259 260 corvalues = uarray(_uc.correlated_values(X, CM)) 261 262 allvalues = nakedvalues 263 264 for i, x in enumerate(values): 265 allvalues[x] = corvalues[i*N:i*N+N] 266 267 return allvalues
Read correlated data from a CSV-like string.
Column names are interpreted in the following way:
- In most cases, each columns is converted to a dict value, with the corresponding dict key being the column's label.
- Columns whose label starts with
SEare interpreted as specifying the standard error for the latest preceding data column. - Columns whose label starts with
correlare interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix. - Columns whose label starts with
covarare interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix. SE,correl, andcovarmay be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex:SE_foo,correl_bar,covar_baz).correl, andcovarmay also be specified for any pair of variable, by adding an underscore followed by the two variable labels, joined by a second underscore (ex:correl_foo_bar,covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.- Exceptions will be raised, for any given variable:
- when specifying both
covarand any combination of (SE,correl) - when specifying
correlwithoutSE
- when specifying both
Arguments
data: a CSV-like stringsep: the CSV separatorvalidate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifyingvalidate_covar = Falsebypasses this computationally expensive step.
Example
import correldata
data = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]
print(read_data(data))
# yields:
#
# > {
# 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
# 'Tacid': array([90., 90., 90.]),
# 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
# 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
# }
270def read_data_from_file(filename: str | _os.PathLike, **kwargs): 271 ''' 272 Read correlated data from a CSV file. 273 274 **Arguments** 275 - `filename`: `str` or path to the file to read from 276 - `kwargs`: passed to correldata.read_data() 277 ''' 278 with open(filename) as fid: 279 return read_data(fid.read(), **kwargs)
Read correlated data from a CSV file.
Arguments
filename:stror path to the file to read fromkwargs: passed to read_data()
282def f2s( 283 x: Any, 284 f: (str | Callable | dict), 285 k: Hashable = None, 286 fb: (str | Callable) = 'z.6g', 287) -> str: 288 ''' 289 Format `x` according to format `f` 290 291 * If `f` is a string, return `f'{x:{f}}'` 292 * If `f` is a callable, return `f(x)` 293 * If `f` is a dict and optional argument `k` is a hashable, 294 return f2s(x, f[k]), otherwise return f2s(x, fb) 295 ''' 296 297 if isinstance (x, str): 298 return x 299 if isinstance (f, str): 300 return f'{x:{f}}' 301 if isinstance (f, Callable): 302 return f(x) 303 if isinstance (f, dict): 304 if k in f: 305 return f2s(x, f[k]) 306 if isinstance (fb, str): 307 return f'{x:{fb}}' 308 if isinstance (fb, Callable): 309 return fb(x) 310 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
Format x according to format f
- If
fis a string, returnf'{x:{f}}' - If
fis a callable, returnf(x) - If
fis a dict and optional argumentkis a hashable, return f2s(x, f[k]), otherwise return f2s(x, fb)
314def data_string( 315 data: dict, 316 sep: str = ',', 317 include_fields: list = None, 318 exclude_fields: list = [], 319 float_format: (str | dict | Callable) = 'z.6g', 320 correl_format: (str | dict | Callable) = 'z.6f', 321 default_float_format: (str | Callable) = 'z.6g', 322 default_correl_format: (str | Callable) = 'z.6f', 323 align: str = '>', 324 atol: float = 1e-12, 325 rtol: float = 1e-12, 326): 327 ''' 328 Generate CSV-like string from correlated data 329 330 **Arguments** 331 - `data`: dict of arrays with strings, floats or correlated data 332 - `sep`: the CSV separator 333 - `include_fields`: subset of fields to write; if `None`, write all fields 334 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 335 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 336 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 337 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 338 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 339 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 340 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 341 missing from `float_format.keys()` will use `default_float_format` instead. 342 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 343 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 344 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 345 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 346 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 347 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 348 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 349 350 351 **Example** 352 353 ```py 354 from correldata import _uc 355 from correldata import _np 356 from correldata import * 357 358 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 359 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 360 361 data = dict(X=X, Y=Y, Z=X+Y) 362 363 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 364 365 # yields: 366 # 367 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 368 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 369 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 370 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 371 ``` 372 ''' 373 if include_fields is None: 374 include_fields = [_ for _ in data] 375 cols, ufields = [], [] 376 for f in include_fields: 377 if f in exclude_fields: 378 continue 379 if isinstance(data[f], uarray): 380 ufields.append(f) 381 N = data[f].size 382 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 383 if f'SE_{f}' not in exclude_fields: 384 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 385 if f'correl_{f}' not in exclude_fields: 386 CM = _uc.correlation_matrix(data[f]) 387 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 388 for i in range(N): 389 cols.append( 390 ['' if i else f'correl_{f}'] 391 + [ 392 f2s( 393 CM[i,j], 394 correl_format, 395 f, 396 default_correl_format, 397 ) 398 for j in range(N) 399 ] 400 ) 401 402 else: 403 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 404 405 for i in range(len(ufields)): 406 for j in range(i): 407 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 408 continue 409 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 410 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 411 for k in range(N): 412 cols.append( 413 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 414 + [ 415 f2s( 416 CM[k,l], 417 correl_format, 418 f, 419 default_correl_format, 420 ) 421 for l in range(N) 422 ] 423 ) 424 425 lines = list(map(list, zip(*cols))) 426 427 if align: 428 lengths = [max([len(e) for e in l]) for l in cols] 429 for l in lines: 430 for k,ln in enumerate(lengths): 431 l[k] = f'{l[k]:{align}{ln}s}' 432 return '\n'.join([(sep+' ').join(l) for l in lines]) 433 434 return '\n'.join([sep.join(l) for l in lines])
Generate CSV-like string from correlated data
Arguments
data: dict of arrays with strings, floats or correlated datasep: the CSV separatorinclude_fields: subset of fields to write; ifNone, write all fieldsexclude_fields: subset of fields to ignore (takes precedence overinclude_fields); to exclude only the SE for fieldfoo, includeSE_foo; same goes forcorrel_foofloat_format: formatting for float values. May be a string (ex:'z.3f'), a callable (ex:lambda x: '.2f' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex:{'foo': '.2e', 'bar': (lambda x: str(x))}).correl_format: same asfloat_format, but applies to correlation matrix elementsdefault_float_format: only used whenfloat_formatis a dict; in that case, fields missing fromfloat_format.keys()will usedefault_float_formatinstead. corresponding to different fields (ex:{'foo': '.2e', 'bar':lambda x: str(x)}).default_correl_format: same asdefault_float_format, but applies tocorrel_formatalign: right-align (>), left-align (<), or don't align (empty string) CSV valuesatol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrixrtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
Example
from correldata import _uc
from correldata import _np
from correldata import *
X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
data = dict(X=X, Y=Y, Z=X+Y)
print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
# yields:
#
# X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
# 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
# 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
# 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
438def save_data_to_file(data, filename, **kwargs): 439 ''' 440 Write correlated data to a CSV file. 441 442 **Arguments** 443 - `data`: dict of arrays with strings, floats or correlated data 444 - `filename`: `str` or path to the file to read from 445 - `kwargs`: passed to correldata.data_string() 446 ''' 447 with open(filename, 'w') as fid: 448 return fid.write(data_string(data, **kwargs))
Write correlated data to a CSV file.
Arguments
data: dict of arrays with strings, floats or correlated datafilename:stror path to the file to read fromkwargs: passed to data_string()