I am trying to concat the following dataframes:
df1
JavaScript
x
7
1
price side timestamp
2
timestamp
3
2016-01-04 00:01:15.631331072 0.7286 2 1451865675631331
4
2016-01-04 00:01:15.631399936 0.7286 2 1451865675631400
5
2016-01-04 00:01:15.631860992 0.7286 2 1451865675631861
6
2016-01-04 00:01:15.631866112 0.7286 2 1451865675631866
7
and:
df2
JavaScript
1
8
1
bid bid_size offer offer_size
2
timestamp
3
2016-01-04 00:00:31.331441920 0.7284 4000000 0.7285 1000000
4
2016-01-04 00:00:53.631324928 0.7284 4000000 0.7290 4000000
5
2016-01-04 00:01:03.131234048 0.7284 5000000 0.7286 4000000
6
2016-01-04 00:01:12.131444992 0.7285 1000000 0.7286 4000000
7
2016-01-04 00:01:15.631364096 0.7285 4000000 0.7290 4000000
8
With
JavaScript
1
2
1
data = pd.concat([df1,df2], axis=1)
2
But I get the follwing output:
JavaScript
1
73
73
1
InvalidIndexError Traceback (most recent call last)
2
<ipython-input-38-2e88458f01d7> in <module>()
3
----> 1 data = pd.concat([df1,df2], axis=1)
4
2 data = data.fillna(method='pad')
5
3 data = data.fillna(method='bfill')
6
4 data['timestamp'] = data.index.values#converting to datetime
7
5 data['timestamp'] = pd.to_datetime(data['timestamp'])#converting to datetime
8
9
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
10
810 keys=keys, levels=levels, names=names,
11
811 verify_integrity=verify_integrity,
12
--> 812 copy=copy)
13
813 return op.get_result()
14
814
15
16
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
17
947 self.copy = copy
18
948
19
--> 949 self.new_axes = self._get_new_axes()
20
950
21
951 def get_result(self):
22
23
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
24
1013 if i == self.axis:
25
1014 continue
26
-> 1015 new_axes[i] = self._get_comb_axis(i)
27
1016 else:
28
1017 if len(self.join_axes) != ndim - 1:
29
30
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
31
1039 raise TypeError("Cannot concatenate list of %s" % types)
32
1040
33
-> 1041 return _get_combined_index(all_indexes, intersect=self.intersect)
34
1042
35
1043 def _get_concat_axis(self):
36
37
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
38
6120 index = index.intersection(other)
39
6121 return index
40
-> 6122 union = _union_indexes(indexes)
41
6123 return _ensure_index(union)
42
6124
43
44
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
45
6149
46
6150 if hasattr(result, 'union_many'):
47
-> 6151 return result.union_many(indexes[1:])
48
6152 else:
49
6153 for other in indexes[1:]:
50
51
/usr/local/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
52
959 else:
53
960 tz = this.tz
54
--> 961 this = Index.union(this, other)
55
962 if isinstance(this, DatetimeIndex):
56
963 this.tz = tz
57
58
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
59
1553 result.extend([x for x in other._values if x not in value_set])
60
1554 else:
61
-> 1555 indexer = self.get_indexer(other)
62
1556 indexer, = (indexer == -1).nonzero()
63
1557
64
65
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit, tolerance)
66
1890
67
1891 if not self.is_unique:
68
-> 1892 raise InvalidIndexError('Reindexing only valid with uniquely'
69
1893 ' valued Index objects')
70
1894
71
72
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
73
I have removed additional columns and removed duplicates and NA where there could be a conflict – but I simply do not know what’s wrong.
Advertisement
Answer
pd.concat
requires that the indices be unique. To remove rows with duplicate indices, use
JavaScript
1
2
1
df = df.loc[~df.index.duplicated(keep='first')]
2
JavaScript
1
45
45
1
import pandas as pd
2
from pandas import Timestamp
3
4
df1 = pd.DataFrame(
5
{'price': [0.7286, 0.7286, 0.7286, 0.7286],
6
'side': [2, 2, 2, 2],
7
'timestamp': [1451865675631331, 1451865675631400,
8
1451865675631861, 1451865675631866]},
9
index=pd.DatetimeIndex(['2000-1-1', '2000-1-1', '2001-1-1', '2002-1-1']))
10
11
12
df2 = pd.DataFrame(
13
{'bid': [0.7284, 0.7284, 0.7284, 0.7285, 0.7285],
14
'bid_size': [4000000, 4000000, 5000000, 1000000, 4000000],
15
'offer': [0.7285, 0.729, 0.7286, 0.7286, 0.729],
16
'offer_size': [1000000, 4000000, 4000000, 4000000, 4000000]},
17
index=pd.DatetimeIndex(['2000-1-1', '2001-1-1', '2002-1-1', '2003-1-1', '2004-1-1']))
18
19
20
df1 = df1.loc[~df1.index.duplicated(keep='first')]
21
# price side timestamp
22
# 2000-01-01 0.7286 2 1451865675631331
23
# 2001-01-01 0.7286 2 1451865675631861
24
# 2002-01-01 0.7286 2 1451865675631866
25
26
df2 = df2.loc[~df2.index.duplicated(keep='first')]
27
# bid bid_size offer offer_size
28
# 2000-01-01 0.7284 4000000 0.7285 1000000
29
# 2001-01-01 0.7284 4000000 0.7290 4000000
30
# 2002-01-01 0.7284 5000000 0.7286 4000000
31
# 2003-01-01 0.7285 1000000 0.7286 4000000
32
# 2004-01-01 0.7285 4000000 0.7290 4000000
33
34
result = pd.concat([df1, df2], axis=0)
35
print(result)
36
bid bid_size offer offer_size price side timestamp
37
2000-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
38
2001-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
39
2002-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
40
2000-01-01 0.7284 4000000 0.7285 1000000 NaN NaN NaN
41
2001-01-01 0.7284 4000000 0.7290 4000000 NaN NaN NaN
42
2002-01-01 0.7284 5000000 0.7286 4000000 NaN NaN NaN
43
2003-01-01 0.7285 1000000 0.7286 4000000 NaN NaN NaN
44
2004-01-01 0.7285 4000000 0.7290 4000000 NaN NaN NaN
45
Note there is also pd.join
, which can join DataFrames based on their indices,
and handle non-unique indices based on the how
parameter. Rows with duplicate
index are not removed.
JavaScript
1
24
24
1
In [94]: df1.join(df2)
2
Out[94]:
3
price side timestamp bid bid_size offer
4
2000-01-01 0.7286 2 1451865675631331 0.7284 4000000 0.7285
5
2000-01-01 0.7286 2 1451865675631400 0.7284 4000000 0.7285
6
2001-01-01 0.7286 2 1451865675631861 0.7284 4000000 0.7290
7
2002-01-01 0.7286 2 1451865675631866 0.7284 5000000 0.7286
8
9
offer_size
10
2000-01-01 1000000
11
2000-01-01 1000000
12
2001-01-01 4000000
13
2002-01-01 4000000
14
15
In [95]: df1.join(df2, how='outer')
16
Out[95]:
17
price side timestamp bid bid_size offer offer_size
18
2000-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7285 1000000
19
2000-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7285 1000000
20
2001-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7290 4000000
21
2002-01-01 0.7286 2 1.451866e+15 0.7284 5000000 0.7286 4000000
22
2003-01-01 NaN NaN NaN 0.7285 1000000 0.7286 4000000
23
2004-01-01 NaN NaN NaN 0.7285 4000000 0.7290 4000000
24