-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_fake_pandas_df.py
56 lines (51 loc) · 3.47 KB
/
generate_fake_pandas_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
https://zhuanlan.zhihu.com/p/394857784
import pandas as pd
import numpy as np
from itertools import cycle
class _FakeDataFrameArgumentError(ValueError, AssertionError):
    """Signal inconsistent arguments passed to ``generate_fake_dataframe``.

    It is a ``ValueError`` because the input is invalid, and also an
    ``AssertionError`` so that handlers written for either type catch it.
    """


def generate_fake_dataframe(size, cols, col_names=None, intervals=None, seed=None):
    """Build a DataFrame of random data, one column per entry of *cols*.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str or iterable of str
        One type code per column: ``"i"`` (integers), ``"f"`` (floats),
        ``"c"`` (categories) or ``"d"`` (dates). A plain string such as
        ``"iifcd"`` is read one character per column.
    col_names : list of str, optional
        Column names, one per column. When ``None`` the names are
        ``column_<position>_<int|float|cat|date>``.
    intervals : dict or list, optional
        Value ranges for the columns.

        * ``None``: use the built-in defaults for every column.
        * ``dict``: maps a type code to the interval that replaces the
          default for all columns of that type.
        * ``list``: one interval per column; a ``None`` entry falls back to
          the default for that column's type.

        An interval is ``(start, end)`` for ``"i"``, ``"f"`` and ``"d"``
        columns. For ``"c"`` columns it is either ``(family, n)``, meaning
        "draw ``n`` distinct values from the built-in *family* and sample
        from those", or a list of the values to sample from.
    seed : optional
        Passed to ``numpy.random.default_rng``; the same seed and arguments
        give the same DataFrame.

    Returns
    -------
    pandas.DataFrame
        A frame with ``size`` rows and one column per entry of *cols*.

    Raises
    ------
    ValueError
        (Also an ``AssertionError``.) If *cols* contains an unsupported type
        code, if *col_names* or *intervals* is a list whose length differs
        from the number of columns, if an *intervals* dict has unknown keys,
        if an interval has the wrong shape, or if a category family does not
        exist.
    """
    categories_dict = {
        'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
        'names': ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
        'cities': ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
        'colors': ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white'],
    }
    default_intervals = {"i": (0, 10), "f": (0, 100), "c": ("names", 5), "d": ("2020-01-01", "2020-12-31")}
    suffix = {"c": "cat", "i": "int", "f": "float", "d": "date"}
    rng = np.random.default_rng(seed)

    # Materialise once so one-shot iterables survive the several passes below.
    cols = list(cols)

    # Reject unknown codes up front; otherwise such a column would either
    # trigger an opaque KeyError or be silently left out of the result.
    valid_codes = tuple(default_intervals)
    unknown_codes = [col for col in cols if col not in valid_codes]
    if unknown_codes:
        raise _FakeDataFrameArgumentError(
            f"Unsupported column type codes {unknown_codes}; expected one of {list(valid_codes)}"
        )

    # By default successive categorical columns rotate through the families,
    # starting with the default one, so they do not all share the same values.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if isinstance(col_names, list):
        if len(col_names) != len(cols):
            raise _FakeDataFrameArgumentError(
                f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
            )
    elif col_names is None:
        col_names = [f"column_{str(i)}_{suffix[col]}" for i, col in enumerate(cols)]

    if isinstance(intervals, list):
        if len(intervals) != len(cols):
            raise _FakeDataFrameArgumentError(
                f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
            )
    else:
        if isinstance(intervals, dict):
            if set(intervals) - set(default_intervals):
                raise _FakeDataFrameArgumentError("The intervals parameter has invalid keys")
            # A dict entry for "c" replaces the rotating default, so every
            # categorical column then uses that one interval.
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]

    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]

        # Check the type before the length so that non-sized values are
        # reported with this message instead of a TypeError from len().
        is_pair = isinstance(interval, tuple) and len(interval) == 2
        if not (is_pair or isinstance(interval, list)):
            raise _FakeDataFrameArgumentError(
                f"This interval {interval} is neither a tuple of two elements nor a list of strings."
            )

        if col == "c":
            if isinstance(interval, list):
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default interval: take the next family in the rotation.
                    cat_family = next(cat_family)
                if cat_family not in categories_dict.keys():
                    raise _FakeDataFrameArgumentError(
                        f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                    )
                # Pick `length` distinct values of the family to sample from.
                categories = rng.choice(categories_dict[cat_family], length, replace=False, shuffle=True)
            df[col_name] = rng.choice(categories, size, shuffle=True)
            continue

        # "i", "f" and "d" all need exactly a start and an end.
        if len(interval) != 2:
            raise _FakeDataFrameArgumentError(
                f"This interval {interval} must contain exactly a start and an end for column type '{col}'."
            )
        start, end = interval
        if col == "i":
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = rng.uniform(start, end, size)
        else:  # col == "d"
            df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df