Create a function that reads in data from attached USGS streamflow files at Maine stations

Count the number of data points in each file

Analyze the Data

In [2]:
import os
from dateutil import parser
import matplotlib.pyplot as pl
import numpy as np

%matplotlib inline
In [3]:
def readUSGS(directory,filename):
    from numpy import array
    fullname='{0}/{1}'.format(directory,filename)
    fileobj=open(fullname)
    data=[]
    for line in fileobj:
        words=line.split('\t')
        if words[0]=='agency_cd':
            idx=[i for i in range(len(words)) if ('60' in words[i])]
            if (len(idx)!=0) and (idx[0]!=3):
                print('discharge in column {0} in {1}'.format(idx,filename))
        elif (words[0]=='USGS') and (len(idx)!=0):
            date=parser.parse(words[2])
            site_no=float(filename[:-3])
            try:
                discharge=float(words[idx[0]])
            except ValueError:
                discharge=-9999.
                #print(words[3])
            data.append([site_no,date,discharge])
    fileobj.close()
    return array(data)
In [12]:
filelist=os.listdir('data')
filelist.sort()

data=[]
stack=None
for filename in filelist:
    new_data=readUSGS('data',filename)
    data.append(new_data)
    if len(new_data)>0:
        #print(new_data)
        try:
            stack=np.vstack((stack,new_data))
        except NameError:
            stack=new_data.copy()
        except ValueError:
            stack=new_data.copy()
counts=[i.shape[0] for i in data]
pl.hist(counts)

complete_records=[i for i in range(len(counts)) if counts[i]==3640]

print('Number of complete records = {0}'.format(len(complete_records)))

count=-1
plotrows=2

for i in complete_records[:10]:
    count+=1
    plotnum=count%plotrows
    if plotnum==0:
        fig=pl.figure()
    sp=fig.add_subplot(plotrows,1,plotnum+1)
    sp.plot(data[i][:,1],data[i][:,2])
    #hide tick labels in top subplot, to title shows up better
    if plotnum==0:
        for lb in sp.get_xticklabels():
            lb.set_visible(False)
    
    pl.title(filelist[i],color='red')
    pl.ylim(ymin=0)
pl.show()
discharge in column [15, 16] in 01010000.txt
discharge in column [15, 16] in 01011000.txt
discharge in column [15, 16] in 01018035.txt
discharge in column [15, 16] in 01022500.txt
discharge in column [9, 10] in 010228955.txt
discharge in column [15, 16] in 01031500.txt
discharge in column [11, 12] in 01036390.txt
discharge in column [15, 16] in 01037380.txt
discharge in column [15, 16] in 01038000.txt
discharge in column [15, 16] in 01054200.txt
discharge in column [5, 6] in 01063100.txt
Number of complete records = 23

In [5]:
[filelist[i] for i in complete_records[:10]]
Out[5]:
['01010500.txt',
 '01014000.txt',
 '01015800.txt',
 '01017000.txt',
 '01017960.txt',
 '01018000.txt',
 '01021000.txt',
 '01022500.txt',
 '01029500.txt',
 '01030500.txt']
In [6]:
np.where(max(stack[:,2])==stack[:,2])
Out[6]:
(array([19842, 19843]),)
In [7]:
stack[1645:1649]
Out[7]:
array([[1010000.0, datetime.datetime(2008, 4, 29, 0, 0), 29800.0],
       [1010000.0, datetime.datetime(2008, 4, 30, 0, 0), 42300.0],
       [1010000.0, datetime.datetime(2008, 5, 1, 0, 0), 34200.0],
       [1010000.0, datetime.datetime(2008, 5, 2, 0, 0), 23200.0]], dtype=object)
In [7]: