In python, I have the following data in a list of namedtuple in memory:
JavaScript
x
24
24
1
from collections import namedtuple
2
perfvalues=[]
3
perfitem = namedtuple('perfitem', 'cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write')
4
item1=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk1', 650000, 500000, 1.2, 0.9)
5
item2=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk2', 630000, 480000, 1.1, 1)
6
item3=perfitem('cluster1', 'host1', 'database1', 'dg_data2', 'disk1', 730000, 250000, 0.4, 0.7)
7
item4=perfitem('cluster1', 'host1', 'database2', 'dg_data1', 'disk1', 320000, 400000, 1, 0.4)
8
item5=perfitem('cluster1', 'host1', 'database2', 'dg_data2', 'disk1', 550000, 300000, 0.8, 0.8)
9
item6=perfitem('cluster1', 'host2', 'database3', 'dg_data1', 'disk1', 420000, 310000, 1.2, 0.7)
10
item7=perfitem('cluster1', 'host2', 'database3', 'dg_data2', 'disk1', 880000, 280000, 0.4, 0.6)
11
item8=perfitem('cluster1', 'host2', 'database4', 'dg_data5', 'disk1', 440000, 600000, 1, 0.5)
12
item9=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk1', 490000, 450000, 1, 0.4)
13
item10=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk2', 410000, 450000, 1, 0.4)
14
perfvalues.append(item1)
15
perfvalues.append(item2)
16
perfvalues.append(item3)
17
perfvalues.append(item4)
18
perfvalues.append(item5)
19
perfvalues.append(item6)
20
perfvalues.append(item7)
21
perfvalues.append(item8)
22
perfvalues.append(item9)
23
print(perfvalues)
24
I want to group the data by :
- cluster
- cluster and host
- cluster and host and database
- cluster and host and database and diskgroup
I won’t need the disk details.
In each group I want to :
- sum the values of read_bytes_per_sec and write_bytes_per_sec
- compute the average of the values of avg_ms_per_read and avg_ms_per_write
As a result I want a list containing :
JavaScript
1
16
16
1
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
2
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
3
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
4
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
5
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
6
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
7
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data5', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
8
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
9
group='per_database', cluster='cluster1', host='host1', db='database1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
10
group='per_database', cluster='cluster1', host='host1', db='database2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
11
group='per_database', cluster='cluster1', host='host2', db='database3', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
12
group='per_database', cluster='cluster1', host='host2', db='database4', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
13
group='per_host', cluster='cluster1', host='host1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
14
group='per_host', cluster='cluster1', host='host2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
15
group='per_cluster', cluster='cluster1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
16
result may be a list of dict or list of namedtuple I guess. the order in the final list does not matters. I’m using python 3.8.
Any Ideas? Thanks
Advertisement
Answer
These are pandas data frames, if you need python dictionary version you’ll get it by adding .to_dict()
to per_diskgroup
, per_database
, per_host
, per_cluster
. eg: per_database.to_dict()
JavaScript
1
72
72
1
In [2]: l = ["cluster1,host1,database1,dg_data1,disk1,650000,500000,1.2,0.9",
2
"cluster1,host1,database1,dg_data1,disk2,630000,480000,1.1,1", :
3
"cluster1,host1,database1,dg_data2,disk1,730000,250000,0.4,0.7", :
4
"cluster1,host1,database2,dg_data1,disk1,320000,400000,1,0.4", :
5
"cluster1,host1,database2,dg_data2,disk1,550000,300000,0.8,0.8", :
6
"cluster1,host2,database3,dg_data1,disk1,420000,310000,1.2,0.7", :
7
"cluster1,host2,database3,dg_data2,disk1,880000,280000,0.4,0.6", :
8
"cluster1,host2,database4,dg_data5,disk1,440000,600000,1,0.5", :
9
"cluster1,host2,database4,dg_data2,disk1,490000,450000,1,0.4", :
10
"cluster1,host2,database4,dg_data2,disk2,410000,450000,1,0.4"] :
11
l = [i.split(",") for i in l] :
12
df = pd.DataFrame(l, columns="cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write :
13
".split()) :
14
cols = ['read_bytes_per_sec', 'write_bytes_per_sec', 'avg_ms_per_read','avg_ms_per_write'] :
15
df[cols] = df[cols].astype(float) :
16
df :
17
Out[2]:
18
cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
19
0 cluster1 host1 database1 dg_data1 disk1 650000.0 500000.0 1.2 0.9
20
1 cluster1 host1 database1 dg_data1 disk2 630000.0 480000.0 1.1 1.0
21
2 cluster1 host1 database1 dg_data2 disk1 730000.0 250000.0 0.4 0.7
22
3 cluster1 host1 database2 dg_data1 disk1 320000.0 400000.0 1.0 0.4
23
4 cluster1 host1 database2 dg_data2 disk1 550000.0 300000.0 0.8 0.8
24
5 cluster1 host2 database3 dg_data1 disk1 420000.0 310000.0 1.2 0.7
25
6 cluster1 host2 database3 dg_data2 disk1 880000.0 280000.0 0.4 0.6
26
7 cluster1 host2 database4 dg_data5 disk1 440000.0 600000.0 1.0 0.5
27
8 cluster1 host2 database4 dg_data2 disk1 490000.0 450000.0 1.0 0.4
28
9 cluster1 host2 database4 dg_data2 disk2 410000.0 450000.0 1.0 0.4
29
30
In [3]: aggregation = {"read_bytes_per_sec" : sum, "avg_ms_per_read" : np.mean, "avg_ms_per_write" : np.mean}
31
per_diskgroup = df.groupby(['disk', 'cluster', 'host', 'database', 'diskgroup']).agg(aggregation) :
32
per_database = df.groupby (['cluster', 'host', 'database']).agg(aggregation) :
33
per_host = df.groupby (['host', 'cluster']).agg(aggregation) :
34
per_cluster = df.groupby (['cluster']).agg(aggregation) :
35
36
In [4]: per_diskgroup
37
Out[4]:
38
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
39
disk cluster host database diskgroup
40
disk1 cluster1 host1 database1 dg_data1 650000.0 1.2 0.9
41
dg_data2 730000.0 0.4 0.7
42
database2 dg_data1 320000.0 1.0 0.4
43
dg_data2 550000.0 0.8 0.8
44
host2 database3 dg_data1 420000.0 1.2 0.7
45
dg_data2 880000.0 0.4 0.6
46
database4 dg_data2 490000.0 1.0 0.4
47
dg_data5 440000.0 1.0 0.5
48
disk2 cluster1 host1 database1 dg_data1 630000.0 1.1 1.0
49
host2 database4 dg_data2 410000.0 1.0 0.4
50
51
In [5]: per_database
52
Out[5]:
53
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
54
cluster host database
55
cluster1 host1 database1 2010000.0 0.9 0.866667
56
database2 870000.0 0.9 0.600000
57
host2 database3 1300000.0 0.8 0.650000
58
database4 1340000.0 1.0 0.433333
59
60
In [6]: per_host
61
Out[6]:
62
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
63
host cluster
64
host1 cluster1 2880000.0 0.90 0.76
65
host2 cluster1 2640000.0 0.92 0.52
66
67
In [7]: per_cluster
68
Out[7]:
69
read_bytes_per_sec avg_ms_per_read avg_ms_per_write
70
cluster
71
cluster1 5520000.0 0.91 0.64
72