correldata

Read/write vectors of correlated data from/to a csv file.

These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-13'
 15__version__   = '1.2.3'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22from typing import Callable, Hashable, Any
 23
 24class uarray(_np.ndarray):
 25
 26    __doc__ = """
 27    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
 28    of [ufloat](https://pypi.org/project/uncertainties) values
 29    """
 30
 31    def __new__(cls, a):
 32        obj = _np.asarray(a).view(cls)
 33        return obj
 34    
 35    n = property(fget = _np.vectorize(lambda x : x.n))
 36    """Return the array of nominal values (read-only)."""
 37
 38    s = property(fget = _np.vectorize(lambda x : x.s))
 39    """Return the array of standard errors (read-only)"""
 40
 41    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
 42    """Return the correlation matrix of the array elements (read-only)"""
 43
 44    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
 45    """Return the covariance matrix of the array elements (read-only)"""
 46
 47    nv = n
 48    "Alias for `uarray.nv`"
 49
 50    se = s
 51    "Alias for `uarray.s`"
 52
 53    cor = correl
 54    "Alias for `uarray.correl`"
 55
 56    cov = covar
 57    "Alias for `uarray.covar`"    
 58
 59
 60def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
 61	'''
 62	Test whether 2-D array `M` is symmetric and positive semidefinite.
 63	'''
 64	ev = _np.linalg.eigvals(M)
 65	return (
 66		_np.allclose(M, M.T) # M is symmetric
 67		and _np.all(
 68			(ev > 0) | _np.isclose(ev, 0)
 69		) # all eignevalues are either real and strictly positive or close to zero
 70	)
 71
 72
 73def smart_type(x: str):
 74	'''
 75	Tries to convert string `x` to a float if it includes a decimal point, or
 76	to an integer if it does not. If both attempts fail, return the original
 77	string unchanged.
 78	'''
 79	try:
 80		y = float(x)
 81	except ValueError:
 82		return x
 83	if y % 1 == 0 and '.' not in x:
 84		return int(y)
 85	return y
 86
 87
 88def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 89	'''
 90	Read correlated data from a CSV-like string.
 91	
 92	Column names are interpreted in the following way:
 93	* In most cases, each columns is converted to a dict value, with the corresponding
 94	dict key being the column's label.
 95	* Columns whose label starts with `SE` are interpreted as specifying the standard
 96	error for the latest preceding data column.
 97	* Columns whose label starts with `correl` are interpreted as specifying the
 98	correlation matrix for the latest preceding data column. In that case, column labels
 99	are ignored for the rest of the columns belonging to this matrix.
100	* Columns whose label starts with `covar` are interpreted as specifying the
101	covariance matrix for the latest preceding data column. In that case, column labels
102	are ignored for the rest of the columns belonging to this matrix.
103	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
104	the latest preceding data column, by adding an underscore followed by the variable's
105	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
106	* `correl`, and `covar` may also be specified for any pair of variable, by adding an
107	underscore followed by the two variable labels, joined by a second underscore
108	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
109	correspond, respectively, to the lines and columns of this matrix.
110	* Exceptions will be raised, for any given variable:
111		- when specifying both `covar` and any combination of (`SE`, `correl`)
112		- when specifying `correl` without `SE`
113
114	**Arguments**
115	- `data`: a CSV-like string
116	- `sep`: the CSV separator
117	- `validate_covar`: whether to check that the overall covariance matrix
118	is symmetric and positive semidefinite. Specifying `validate_covar = False`
119	bypasses this computationally expensive step.
120	
121	**Example**
122	```py
123	import correldata
124	data  = """
125	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
126	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
127	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
128	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
129	"""[1:-1]
130	print(correldata.read_data(data))
131	
132	# yields:
133	# 
134	# > {
135	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
136	#     'Tacid': array([90., 90., 90.]),
137	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
138	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
139	#   }
140	```
141	'''
142
143	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
144	N = len(data) - 1
145
146	values, se, correl, covar = {}, {}, {}, {}
147	j = 0
148	while j < len(data[0]):
149		field = data[0][j]
150		if not (
151			field.startswith('SE_')
152			or field.startswith('correl_')
153			or field.startswith('covar_')
154			or field == 'SE'
155			or field == 'correl'
156			or field == 'covar'
157			or len(field) == 0
158		):
159			values[field] = _np.array([l[j] for l in data[1:]])
160			j += 1
161			oldfield = field
162		elif field.startswith('SE_'):
163			se[field[3:]] = _np.array([l[j] for l in data[1:]])
164			j += 1
165		elif field == 'SE':
166			se[oldfield] = _np.array([l[j] for l in data[1:]])
167			j += 1
168		elif field.startswith('correl_'):
169			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
170			j += N
171		elif field == 'correl':
172			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
173			j += N
174		elif field.startswith('covar_'):
175			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
176			j += N
177		elif field == 'covar':
178			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
179			j += N
180
181	nakedvalues = {}
182	for k in [_ for _ in values]:
183		if (
184			k not in se
185			and k not in correl
186			and k not in covar
187		):
188			nakedvalues[k] = values.pop(k)
189
190	for x in values:
191		if x in covar:
192			if x in se:
193				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
194			if x in correl:
195				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
196		if x in correl:
197			if x not in se:
198				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
199
200	for x in correl:
201		if x in values:
202			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
203		else:
204			for x1 in values:
205				for x2 in values:
206					if x == f'{x1}_{x2}':
207						if x1 in se:
208							se1 = se[x1]
209						else:
210							if x1 in covar:
211								se1 = _np.diag(covar[x1])**0.5
212							else:
213								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
214						if x2 in se:
215							se2 = se[x2]
216						else:
217							if x2 in covar:
218								se2 = _np.diag(covar[x2])**0.5
219							else:
220								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
221
222						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
223
224	for x in se:
225		if x in values and x not in correl:
226			covar[x] = _np.diag(se[x]**2)
227
228	for k in [_ for _ in covar]:
229		if k not in values:
230			for j1 in values:
231				for j2 in values:
232					if k == f'{j1}_{j2}':
233						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
234
235	X = _np.array([_ for k in values for _ in values[k]])
236	CM = _np.zeros((X.size, X.size))
237	for i, vi in enumerate(values):
238		for j, vj in enumerate(values):
239			if vi == vj:
240				if vi in covar:
241					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
242			else:
243				if f'{vi}_{vj}' in covar:
244					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
245
246	s = _np.diag(CM)**.5
247	s[s==0] = 1.
248	invs = _np.diag(s**-1)
249
250	if (
251		validate_covar
252		and not (
253			is_symmetric_positive_semidefinite(CM)
254			or is_symmetric_positive_semidefinite(invs @ CM @ invs)
255		)
256	):
257		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
258
259	corvalues = uarray(_uc.correlated_values(X, CM))
260
261	allvalues = nakedvalues
262
263	for i, x in enumerate(values):
264		allvalues[x] = corvalues[i*N:i*N+N]
265
266	return allvalues
267
268
269def read_data_from_file(filename: str | _os.PathLike, **kwargs):
270	'''
271	Read correlated data from a CSV file.
272
273	**Arguments**
274	- `filename`: `str` or path to the file to read from
275	- `kwargs`: passed to correldata.read_data()
276	'''
277	with open(filename) as fid:
278		return read_data(fid.read(), **kwargs)
279
280
281def f2s(
282	x: Any,
283	f: (str | Callable | dict),
284	k: Hashable = None,
285	fb: (str | Callable) = 'z.6g',
286) -> str:
287	'''
288	Format `x` according to format `f`
289	
290	* If `f` is a string, return `f'{x:{f}}'`
291	* If `f` is a callable, return `f(x)`
292	* If `f` is a dict and optional argument `k` is a hashable,
293	  return f2s(x, f[k]), otherwise return f2s(x, fb)
294	'''
295
296	if isinstance (x, str):
297		return x
298	if isinstance (f, str):
299		return f'{x:{f}}'
300	if isinstance (f, Callable):
301		return f(x)
302	if isinstance (f, dict):
303		if k in f:
304			return f2s(x, f[k])
305		if isinstance (fb, str):
306			return f'{x:{fb}}'
307		if isinstance (fb, Callable):
308			return fb(x)
309	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
310	
311
312
313def data_string(
314	data: dict,
315	sep: str = ',',
316	include_fields: list = None,
317	exclude_fields: list = [],
318	float_format: (str | dict | Callable) = 'z.6g',
319	correl_format: (str | dict | Callable) = 'z.6f',
320	default_float_format: (str | Callable) = 'z.6g',
321	default_correl_format: (str | Callable) = 'z.6f',
322	align: str = '>',
323	atol: float = 1e-12,
324	rtol: float = 1e-12,
325):
326	'''
327	Generate CSV-like string from correlated data
328
329	**Arguments**
330	- `data`: dict of arrays with strings, floats or correlated data
331	- `sep`: the CSV separator
332	- `include_fields`: subset of fields to write; if `None`, write all fields
333	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
334	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
335	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
336	  (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
337	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
338	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
339	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
340	  missing from `float_format.keys()` will use `default_float_format` instead.
341	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`).
342	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
343	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
344	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
345	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
346	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
347	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
348	
349	
350	**Example**
351	
352	```py
353	from correldata import _uc
354	from correldata import _np
355	from correldata import *
356	
357	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
358	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
359	
360	data = dict(X=X, Y=Y, Z=X+Y)
361	
362	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
363	
364	# yields:
365	# 
366	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
367	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
368	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
369	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
370	```
371	'''
372	if include_fields is None:
373		include_fields = [_ for _ in data]
374	cols, ufields = [], []
375	for f in include_fields:
376		if f in exclude_fields:
377			continue
378		if isinstance(data[f], uarray):
379			ufields.append(f)
380			N = data[f].size
381			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
382			if f'SE_{f}' not in exclude_fields:
383				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
384			if f'correl_{f}' not in exclude_fields:
385				CM = _uc.correlation_matrix(data[f])
386				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
387					for i in range(N):
388						cols.append(
389							['' if i else f'correl_{f}']
390							+ [
391								f2s(
392									CM[i,j],
393									correl_format,
394									f,
395									default_correl_format,
396								)
397								for j in range(N)
398							]
399						)
400
401		else:
402			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])
403
404	for i in range(len(ufields)):
405		for j in range(i):
406			if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
407				continue
408			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
409			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
410				for k in range(N):
411					cols.append(
412						['' if k else f'correl_{ufields[j]}_{ufields[i]}']
413						+ [
414							f2s(
415								CM[k,l],
416								correl_format,
417								f,
418								default_correl_format,
419							)
420							for l in range(N)
421						]
422					)
423
424	lines = list(map(list, zip(*cols)))
425
426	if align:
427		lengths = [max([len(e) for e in l]) for l in cols]
428		for l in lines:
429			for k,ln in enumerate(lengths):
430				l[k] = f'{l[k]:{align}{ln}s}'
431		return '\n'.join([(sep+' ').join(l) for l in lines])
432
433	return '\n'.join([sep.join(l) for l in lines])
434
435
436
437def save_data_to_file(data, filename, **kwargs):
438	'''
439	Write correlated data to a CSV file.
440
441	**Arguments**
442	- `data`: dict of arrays with strings, floats or correlated data
443	- `filename`: `str` or path to the file to read from
444	- `kwargs`: passed to correldata.data_string()
445	'''
446	with open(filename, 'w') as fid:
447		return fid.write(data_string(data, **kwargs))
class uarray(numpy.ndarray):
25class uarray(_np.ndarray):
26
27    __doc__ = """
28    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
29    of [ufloat](https://pypi.org/project/uncertainties) values
30    """
31
32    def __new__(cls, a):
33        obj = _np.asarray(a).view(cls)
34        return obj
35    
36    n = property(fget = _np.vectorize(lambda x : x.n))
37    """Return the array of nominal values (read-only)."""
38
39    s = property(fget = _np.vectorize(lambda x : x.s))
40    """Return the array of standard errors (read-only)"""
41
42    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
43    """Return the correlation matrix of the array elements (read-only)"""
44
45    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
46    """Return the covariance matrix of the array elements (read-only)"""
47
48    nv = n
49    "Alias for `uarray.nv`"
50
51    se = s
52    "Alias for `uarray.s`"
53
54    cor = correl
55    "Alias for `uarray.correl`"
56
57    cov = covar
58    "Alias for `uarray.covar`"    

1-D ndarray of ufloat values

n

Return the array of nominal values (read-only).

s

Return the array of standard errors (read-only)

correl
42    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))

Return the correlation matrix of the array elements (read-only)

covar
45    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))

Return the covariance matrix of the array elements (read-only)

nv

Alias for uarray.nv

se

Alias for uarray.s

cor
42    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))

Alias for uarray.correl

cov
45    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))

Alias for uarray.covar

Inherited Members
numpy.ndarray
dumps
dump
all
any
argmax
argmin
argpartition
argsort
astype
byteswap
choose
clip
compress
conj
conjugate
copy
cumprod
cumsum
diagonal
dot
fill
flatten
getfield
item
max
mean
min
nonzero
partition
prod
put
ravel
repeat
reshape
resize
round
searchsorted
setfield
setflags
sort
squeeze
std
sum
swapaxes
take
tobytes
tofile
tolist
tostring
trace
transpose
var
view
to_device
ndim
flags
shape
strides
data
itemsize
size
nbytes
base
dtype
real
imag
flat
ctypes
T
mT
ptp
newbyteorder
itemset
device
def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:
61def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
62	'''
63	Test whether 2-D array `M` is symmetric and positive semidefinite.
64	'''
65	ev = _np.linalg.eigvals(M)
66	return (
67		_np.allclose(M, M.T) # M is symmetric
68		and _np.all(
69			(ev > 0) | _np.isclose(ev, 0)
70		) # all eignevalues are either real and strictly positive or close to zero
71	)

Test whether 2-D array M is symmetric and positive semidefinite.

def smart_type(x: str):
74def smart_type(x: str):
75	'''
76	Tries to convert string `x` to a float if it includes a decimal point, or
77	to an integer if it does not. If both attempts fail, return the original
78	string unchanged.
79	'''
80	try:
81		y = float(x)
82	except ValueError:
83		return x
84	if y % 1 == 0 and '.' not in x:
85		return int(y)
86	return y

Tries to convert string x to a float if it includes a decimal point, or to an integer if it does not. If both attempts fail, return the original string unchanged.

def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 89def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 90	'''
 91	Read correlated data from a CSV-like string.
 92	
 93	Column names are interpreted in the following way:
 94	* In most cases, each columns is converted to a dict value, with the corresponding
 95	dict key being the column's label.
 96	* Columns whose label starts with `SE` are interpreted as specifying the standard
 97	error for the latest preceding data column.
 98	* Columns whose label starts with `correl` are interpreted as specifying the
 99	correlation matrix for the latest preceding data column. In that case, column labels
100	are ignored for the rest of the columns belonging to this matrix.
101	* Columns whose label starts with `covar` are interpreted as specifying the
102	covariance matrix for the latest preceding data column. In that case, column labels
103	are ignored for the rest of the columns belonging to this matrix.
104	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
105	the latest preceding data column, by adding an underscore followed by the variable's
106	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
107	* `correl`, and `covar` may also be specified for any pair of variable, by adding an
108	underscore followed by the two variable labels, joined by a second underscore
109	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
110	correspond, respectively, to the lines and columns of this matrix.
111	* Exceptions will be raised, for any given variable:
112		- when specifying both `covar` and any combination of (`SE`, `correl`)
113		- when specifying `correl` without `SE`
114
115	**Arguments**
116	- `data`: a CSV-like string
117	- `sep`: the CSV separator
118	- `validate_covar`: whether to check that the overall covariance matrix
119	is symmetric and positive semidefinite. Specifying `validate_covar = False`
120	bypasses this computationally expensive step.
121	
122	**Example**
123	```py
124	import correldata
125	data  = """
126	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
127	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
128	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
129	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
130	"""[1:-1]
131	print(correldata.read_data(data))
132	
133	# yields:
134	# 
135	# > {
136	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
137	#     'Tacid': array([90., 90., 90.]),
138	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
139	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
140	#   }
141	```
142	'''
143
144	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
145	N = len(data) - 1
146
147	values, se, correl, covar = {}, {}, {}, {}
148	j = 0
149	while j < len(data[0]):
150		field = data[0][j]
151		if not (
152			field.startswith('SE_')
153			or field.startswith('correl_')
154			or field.startswith('covar_')
155			or field == 'SE'
156			or field == 'correl'
157			or field == 'covar'
158			or len(field) == 0
159		):
160			values[field] = _np.array([l[j] for l in data[1:]])
161			j += 1
162			oldfield = field
163		elif field.startswith('SE_'):
164			se[field[3:]] = _np.array([l[j] for l in data[1:]])
165			j += 1
166		elif field == 'SE':
167			se[oldfield] = _np.array([l[j] for l in data[1:]])
168			j += 1
169		elif field.startswith('correl_'):
170			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
171			j += N
172		elif field == 'correl':
173			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
174			j += N
175		elif field.startswith('covar_'):
176			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
177			j += N
178		elif field == 'covar':
179			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
180			j += N
181
182	nakedvalues = {}
183	for k in [_ for _ in values]:
184		if (
185			k not in se
186			and k not in correl
187			and k not in covar
188		):
189			nakedvalues[k] = values.pop(k)
190
191	for x in values:
192		if x in covar:
193			if x in se:
194				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
195			if x in correl:
196				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
197		if x in correl:
198			if x not in se:
199				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
200
201	for x in correl:
202		if x in values:
203			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
204		else:
205			for x1 in values:
206				for x2 in values:
207					if x == f'{x1}_{x2}':
208						if x1 in se:
209							se1 = se[x1]
210						else:
211							if x1 in covar:
212								se1 = _np.diag(covar[x1])**0.5
213							else:
214								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
215						if x2 in se:
216							se2 = se[x2]
217						else:
218							if x2 in covar:
219								se2 = _np.diag(covar[x2])**0.5
220							else:
221								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
222
223						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
224
225	for x in se:
226		if x in values and x not in correl:
227			covar[x] = _np.diag(se[x]**2)
228
229	for k in [_ for _ in covar]:
230		if k not in values:
231			for j1 in values:
232				for j2 in values:
233					if k == f'{j1}_{j2}':
234						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
235
236	X = _np.array([_ for k in values for _ in values[k]])
237	CM = _np.zeros((X.size, X.size))
238	for i, vi in enumerate(values):
239		for j, vj in enumerate(values):
240			if vi == vj:
241				if vi in covar:
242					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
243			else:
244				if f'{vi}_{vj}' in covar:
245					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
246
247	s = _np.diag(CM)**.5
248	s[s==0] = 1.
249	invs = _np.diag(s**-1)
250
251	if (
252		validate_covar
253		and not (
254			is_symmetric_positive_semidefinite(CM)
255			or is_symmetric_positive_semidefinite(invs @ CM @ invs)
256		)
257	):
258		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
259
260	corvalues = uarray(_uc.correlated_values(X, CM))
261
262	allvalues = nakedvalues
263
264	for i, x in enumerate(values):
265		allvalues[x] = corvalues[i*N:i*N+N]
266
267	return allvalues

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each columns is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl, and covar may also be specified for any pair of variable, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
  • Exceptions will be raised, for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
def read_data_from_file(filename: str | os.PathLike, **kwargs):
270def read_data_from_file(filename: str | _os.PathLike, **kwargs):
271	'''
272	Read correlated data from a CSV file.
273
274	**Arguments**
275	- `filename`: `str` or path to the file to read from
276	- `kwargs`: passed to correldata.read_data()
277	'''
278	with open(filename) as fid:
279		return read_data(fid.read(), **kwargs)

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
def f2s( x: Any, f: Union[str, Callable, dict], k: Hashable = None, fb: Union[str, Callable] = 'z.6g') -> str:
282def f2s(
283	x: Any,
284	f: (str | Callable | dict),
285	k: Hashable = None,
286	fb: (str | Callable) = 'z.6g',
287) -> str:
288	'''
289	Format `x` according to format `f`
290	
291	* If `f` is a string, return `f'{x:{f}}'`
292	* If `f` is a callable, return `f(x)`
293	* If `f` is a dict and optional argument `k` is a hashable,
294	  return f2s(x, f[k]), otherwise return f2s(x, fb)
295	'''
296
297	if isinstance (x, str):
298		return x
299	if isinstance (f, str):
300		return f'{x:{f}}'
301	if isinstance (f, Callable):
302		return f(x)
303	if isinstance (f, dict):
304		if k in f:
305			return f2s(x, f[k])
306		if isinstance (fb, str):
307			return f'{x:{fb}}'
308		if isinstance (fb, Callable):
309			return fb(x)
310	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')

Format x according to format f

  • If f is a string, return f'{x:{f}}'
  • If f is a callable, return f(x)
  • If f is a dict and optional argument k is a hashable, return f2s(x, f[k]), otherwise return f2s(x, fb)
def data_string( data: dict, sep: str = ',', include_fields: list = None, exclude_fields: list = [], float_format: Union[str, dict, Callable] = 'z.6g', correl_format: Union[str, dict, Callable] = 'z.6f', default_float_format: Union[str, Callable] = 'z.6g', default_correl_format: Union[str, Callable] = 'z.6f', align: str = '>', atol: float = 1e-12, rtol: float = 1e-12):
314def data_string(
315	data: dict,
316	sep: str = ',',
317	include_fields: list = None,
318	exclude_fields: list = [],
319	float_format: (str | dict | Callable) = 'z.6g',
320	correl_format: (str | dict | Callable) = 'z.6f',
321	default_float_format: (str | Callable) = 'z.6g',
322	default_correl_format: (str | Callable) = 'z.6f',
323	align: str = '>',
324	atol: float = 1e-12,
325	rtol: float = 1e-12,
326):
327	'''
328	Generate CSV-like string from correlated data
329
330	**Arguments**
331	- `data`: dict of arrays with strings, floats or correlated data
332	- `sep`: the CSV separator
333	- `include_fields`: subset of fields to write; if `None`, write all fields
334	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
335	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
336	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
337	  (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
338	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
339	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
340	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
341	  missing from `float_format.keys()` will use `default_float_format` instead.
342	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`).
343	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
344	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
345	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
346	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
347	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
348	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
349	
350	
351	**Example**
352	
353	```py
354	from correldata import _uc
355	from correldata import _np
356	from correldata import *
357	
358	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
359	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
360	
361	data = dict(X=X, Y=Y, Z=X+Y)
362	
363	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
364	
365	# yields:
366	# 
367	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
368	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
369	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
370	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
371	```
372	'''
373	if include_fields is None:
374		include_fields = [_ for _ in data]
375	cols, ufields = [], []
376	for f in include_fields:
377		if f in exclude_fields:
378			continue
379		if isinstance(data[f], uarray):
380			ufields.append(f)
381			N = data[f].size
382			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
383			if f'SE_{f}' not in exclude_fields:
384				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
385			if f'correl_{f}' not in exclude_fields:
386				CM = _uc.correlation_matrix(data[f])
387				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
388					for i in range(N):
389						cols.append(
390							['' if i else f'correl_{f}']
391							+ [
392								f2s(
393									CM[i,j],
394									correl_format,
395									f,
396									default_correl_format,
397								)
398								for j in range(N)
399							]
400						)
401
402		else:
403			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])
404
405	for i in range(len(ufields)):
406		for j in range(i):
407			if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
408				continue
409			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
410			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
411				for k in range(N):
412					cols.append(
413						['' if k else f'correl_{ufields[j]}_{ufields[i]}']
414						+ [
415							f2s(
416								CM[k,l],
417								correl_format,
418								f,
419								default_correl_format,
420							)
421							for l in range(N)
422						]
423					)
424
425	lines = list(map(list, zip(*cols)))
426
427	if align:
428		lengths = [max([len(e) for e in l]) for l in cols]
429		for l in lines:
430			for k,ln in enumerate(lengths):
431				l[k] = f'{l[k]:{align}{ln}s}'
432		return '\n'.join([(sep+' ').join(l) for l in lines])
433
434	return '\n'.join([sep.join(l) for l in lines])

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • include_fields: subset of fields to write; if None, write all fields
  • exclude_fields: subset of fields to ignore (takes precedence over include_fields); to exclude only the SE for field foo, include SE_foo; same goes for correl_foo
  • float_format: formatting for float values. May be a string (ex: 'z.3f'), a callable (ex: lambda x: '.2f' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: {'foo': '.2e', 'bar': (lambda x: str(x))}).
  • correl_format: same as float_format, but applies to correlation matrix elements
  • default_float_format: only used when float_format is a dict; in that case, fields missing from float_format.keys() will use default_float_format instead. corresponding to different fields (ex: {'foo': '.2e', 'bar':lambda x: str(x)}).
  • default_correl_format: same as default_float_format, but applies to correl_format
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
  • rtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix

Example

from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
# 
#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
def save_data_to_file(data, filename, **kwargs):
438def save_data_to_file(data, filename, **kwargs):
439	'''
440	Write correlated data to a CSV file.
441
442	**Arguments**
443	- `data`: dict of arrays with strings, floats or correlated data
444	- `filename`: `str` or path to the file to read from
445	- `kwargs`: passed to correldata.data_string()
446	'''
447	with open(filename, 'w') as fid:
448		return fid.write(data_string(data, **kwargs))

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to read from
  • kwargs: passed to data_string()