import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os
inj_datadir = 'c:\MyDocs/sandbox/data/datasets/FracFocus/KS_injection_wells/'
outdir = 'c:\MyDocs/sandbox/data/datasets/FracFocus/output/'
Quantity reported is "FLUID_INJECTED" from the original Kansas data files. There are ~500 "GAS_INJECTED" records that are not incorportated here....
def fetch_raw_KS_injection(indir=inj_datadir):
# it is assumed that all files in the directory are
# to be processed.
flist = os.listdir(indir)
if len(flist)<1 :
raise Exception('No files to process!')
for f in flist:
if f[-4:] != '.txt':
raise Exception('Some non-txt files present in input directory')
dlst = []
for fn in flist:
print(f'Processing: {fn}')
d = pd.read_csv(indir+fn,low_memory=False)
d = d.filter(['API','KGS_ID','YEAR','MONTH','FLUID_INJECTED']) # drop most columns
dlst.append(d)
print(f' length of data from this file: {len(d)}')
df_inj = pd.concat(dlst,sort=True)
print(f'Total length of dataframe: {len(df_inj)}')
return df_inj
d = fetch_raw_KS_injection()
d.info()
Create string to label the period
d['YearMo'] = 'Y' + d.YEAR.astype('str').str[:] + 'M' + d.MONTH.astype('str').str.zfill(2)[:]
d.head()
print(len(d.KGS_ID.unique()))
print(len(d.API.unique()))
NOTE: There are more KGS_IDs than API numbers. So some wells are labeled with 1 API# but more than one KGS_ID. When compiling these data, we will keep only the first KGS_ID.
kgs = pd.DataFrame(d.groupby(['API'],as_index=False)['KGS_ID'].first())
kgs.head()
Although in the groupby below, there should only be a single number (one measure for any Year/Mo combination), some wells apparently have more than a single record:
test = d.groupby(['YearMo','API'],as_index=False)['FLUID_INJECTED'].count()
test.FLUID_INJECTED.unique()
So we will treat them as partial measures and add all records for a given period.
gb = d.groupby(['API','YearMo'],as_index=False)['FLUID_INJECTED'].sum()
gb.head()
Pivot =gb.pivot(index='API',columns='YearMo')
Pivot.columns = Pivot.columns.get_level_values(1) # snags YearMo as the column name
Pivot.head()
Get the KGS_ID back in there...
mg = pd.merge(kgs,Pivot,on='API',validate='1:m',how='right')
mg.info()
last = d.YearMo.max()
fn = outdir+'KS_Injection_thru_'+last+'.csv'
mg.to_csv(fn)
print(fn)
t = pd.DataFrame(d.groupby('API')['FLUID_INJECTED'].sum())
t[t.FLUID_INJECTED==0].count()
old = pd.read_csv(outdir+'KansasInject_old_from_Ted.csv')