-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf.py
82 lines (65 loc) · 2.44 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import Optional
import numpy as np
def __make_pdf(
    data: list,
    bin_boundaries: np.ndarray,
) -> np.ndarray:
    """Build an empirical PDF from raw samples and explicit bin boundaries.

    Samples are counted per bin, bins with zero counts are removed, and the
    density is obtained by dividing the normalized counts by the widths of
    the remaining bins.

    Returns a two-column array: the first column holds the midpoint of each
    remaining bin, the second column holds the density in that bin.
    """
    # Raw counts only: the density is computed by hand further down, after
    # the empty bins have been dealt with.
    counts, _ = np.histogram(data, bins=bin_boundaries, density=False)

    # Removing boundary ``i`` for every empty bin ``i`` drops the left edge
    # of that bin, so the empty bin is absorbed by its left-hand neighbour
    # (an empty first bin is simply cut off).
    vacant = np.flatnonzero(counts == 0)
    counts = np.delete(counts, vacant)
    edges = np.delete(bin_boundaries, vacant)

    widths = np.diff(edges)
    density = counts / np.sum(counts) / widths

    # Each bin is reported at the midpoint between its two edges.
    midpoints = 0.5 * (edges[1:] + edges[:-1])
    return np.transpose((midpoints, density))
def make_pdf(
    data: list,
    start: Optional[float] = None,
    stop: Optional[float] = None,
    out_points: int = 100,
) -> np.ndarray:
    """Extract empirical PDF on lin-lin scale.

    Bin boundaries are ``out_points`` linearly spaced values between
    ``start`` and ``stop``. A bound left as ``None`` falls back to the
    minimum (for ``start``) or maximum (for ``stop``) of ``data``.
    """
    lower = np.min(data) if start is None else start
    upper = np.max(data) if stop is None else stop
    edges = np.linspace(lower, upper, num=out_points)
    return __make_pdf(data, edges)
def make_log_pdf(
    data: list,
    start: Optional[float] = None,
    stop: Optional[float] = None,
    out_points: int = 100,
) -> np.ndarray:
    """Extract empirical PDF on log-log scale.

    Bin boundaries are ``out_points`` logarithmically spaced values between
    ``start`` and ``stop``.

    Args:
        data: Samples to build the PDF from.
        start: Lower end of the binning range. Defaults to the minimum of
            ``data`` when ``None``.
        stop: Upper end of the binning range. Defaults to the maximum of
            ``data`` when ``None``.
        out_points: Number of bin boundaries to generate.

    Returns:
        The array produced by ``__make_pdf`` for the logarithmic bins.

    Raises:
        ValueError: If the lower or upper bound is not strictly positive
            (including NaN), since its base-10 logarithm is undefined.
    """
    lower = np.min(data) if start is None else start
    upper = np.max(data) if stop is None else stop

    # log10 of a non-positive bound is -inf or NaN. The resulting NaN bin
    # edges are not rejected by np.histogram, so without this check the
    # function would silently return a meaningless result.
    if not (lower > 0 and upper > 0):
        raise ValueError(
            "make_log_pdf requires strictly positive bounds, "
            f"got start={lower!r}, stop={upper!r}"
        )

    bin_boundaries = np.logspace(
        np.log10(lower),
        np.log10(upper),
        num=out_points,
    )
    return __make_pdf(data, bin_boundaries)
def estimate_cdf_from_pdf(pdf: np.ndarray) -> np.ndarray:
    """Approximate empirical CDF from empirical PDF.

    ``pdf`` is read as a two-column array: positions in column 0 and
    densities in column 1. The density is integrated between consecutive
    positions by averaging the two one-sided rectangle sums, the result is
    rescaled so its final value is 1, and a leading 0 is attached for the
    first position.

    Returns a two-column array of positions and CDF values.
    """
    positions = pdf[:, 0]
    density = pdf[:, 1]
    steps = np.diff(positions)

    # One running sum uses the density at the far end of each step, the
    # other uses the density at the near end; their mean is the estimate.
    far_end_sum = np.cumsum(density[1:] * steps)
    near_end_sum = np.cumsum(density[:-1] * steps)
    cumulative = 0.5 * far_end_sum + 0.5 * near_end_sum

    # Rescale so that the last value is exactly 1.
    cumulative = cumulative / cumulative[-1]
    cumulative = np.concatenate(([0], cumulative))
    return np.array([positions, cumulative]).T