问题

考虑这个df:

 # Twenty sample intervals: one [begin, end] pair per row, all sharing one end date.
_BEGIN_STAMPS = [
    '1999-11-18 00:00:00',
    '2016-11-01 00:00:00',
    '2014-10-02 00:00:00',
    '1987-05-07 00:00:00',
    '2005-09-27 00:00:00',
    '2012-12-13 00:00:00',
    '1986-01-01 00:00:00',
    '1986-01-01 00:00:00',
    '2013-09-26 00:00:00',
    '1992-12-16 00:00:00',
    '2001-11-29 00:00:00',
    '1986-01-01 00:00:00',
    '2011-01-13 00:00:00',
    '2005-12-20 00:00:00',
    '2005-11-09 00:00:00',
    '2001-06-19 00:00:00',
    '1988-04-15 00:00:00',
    '2001-04-06 00:00:00',
    '2013-01-02 00:00:00',
    '1995-04-04 00:00:00',
]
_END_STAMP = '2019-09-20 00:00:00'

# Columns are built as {row_label: Timestamp} dicts so the frame gets the
# integer labels 0..19 as its index.
sample_df = pd.DataFrame({
    'begin': {row: pd.Timestamp(stamp) for row, stamp in enumerate(_BEGIN_STAMPS)},
    'end': {row: pd.Timestamp(_END_STAMP) for row in range(len(_BEGIN_STAMPS))},
})
 

现在考虑这10个日期:

 # Ten query dates, carrying the integer labels 15644..15653.
_QUERY_STAMPS = [
    '2009-09-17 00:00:00',
    '2016-09-18 00:00:00',
    '2013-09-19 00:00:00',
    '2011-09-20 00:00:00',
    '2013-09-23 00:00:00',
    '2012-09-24 00:00:00',
    '2016-09-25 00:00:00',
    '2014-09-26 00:00:00',
    '2012-09-27 00:00:00',
    '2013-09-30 00:00:00',
]

date_series = pd.Series({
    label: pd.Timestamp(stamp)
    for label, stamp in enumerate(_QUERY_STAMPS, start=15644)
})
 

我正在寻找在 pandas 中得到与下面这段代码相同结果的最快方法:

 # For every query date, collect the index labels of the rows whose
# [begin, end] interval contains that date (both bounds inclusive).
# The comparisons already yield boolean Series, so wrapping them in
# pd.Series(...) is unnecessary; the two columns are looked up once
# instead of once per date.
interval_start = sample_df.begin
interval_stop = sample_df.end
out = [
    sample_df.index[(interval_start <= x) & (x <= interval_stop)]
    for x in date_series
]
 

各建议方案的耗时(使用大得多的 sample_df 和 date_series):

朴素的解决方案:

 # Time the naive approach: build one boolean mask over the whole frame per date.
from datetime import timedelta
from timeit import default_timer as timer

start = timer()
# NOTE(review): this snippet reads columns FTD/LTD, while the broadcasting
# snippet reads begin/end on the same `this_source` -- confirm which names
# the real data uses.
out = [
    this_source.index[pd.Series(x >= this_source.FTD) & pd.Series(x <= this_source.LTD)]
    for x in stash_index
]
end = timer()
# Print the elapsed wall-clock time as H:MM:SS.ffffff.
print(timedelta(seconds=end - start))
 

0:00:13.23307

@Quang Hoang

 # Time the broadcasting approach.
start = timer()
# Convert to a plain ndarray before adding an axis: recent pandas versions
# reject multi-dimensional indexing such as obj[:, None] on Series/Index.
query_dates = np.asarray(stash_index)[:, None]
# mask[i, j] is True when query date i lies within [begin[j], end[j]].
mask = (this_source.begin.values <= query_dates) & (query_dates <= this_source.end.values)
# Keep the row label where the mask holds and NaN elsewhere, one array per date.
res = list(np.where(mask, this_source.index.values[None, :], np.nan))
# Drop the NaN placeholders so each entry lists only the matching labels.
res = [list(x[~np.isnan(x)]) for x in res]
end = timer()
print(timedelta(seconds=end - start))
 

0:00:04.035035

  最佳答案

这可以通过广播实现:

 # Broadcast the query dates (as a column) against every interval (as a row).
query_col = date_series.values[:, None]
starts = sample_df.begin.values
stops = sample_df.end.values
# mask[i, j] is True when date i falls inside interval j (bounds inclusive).
mask = (starts <= query_col) & (query_col <= stops)

# Row labels where the mask holds, NaN elsewhere.
np.where(mask, sample_df.index.values[None, :], np.nan)
 

输出(注意:下面的数组只有 10 列,而上文的 sample_df 有 20 行,且数值与上文数据不符,因此它应来自另一份示例数据;对上文数据运行会得到形状为 (10, 20) 的数组):

 array([[ 0., nan,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
       [ 0., nan, nan,  3.,  4., nan,  6.,  7., nan,  9.],
       [ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
       [ 0., nan, nan,  3.,  4., nan,  6.,  7., nan,  9.],
       [nan, nan, nan,  3., nan, nan,  6.,  7., nan,  9.],
       [nan, nan, nan,  3., nan, nan,  6.,  7., nan,  9.],
       [ 0., nan, nan,  3., nan, nan,  6.,  7., nan,  9.],
       [ 0., nan, nan,  3.,  4., nan,  6.,  7., nan,  9.],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])
 

  相同标签的其他问题

python pandas