import matplotlib.pyplot as plt import numpy as np # make up some dummy data to test # don't worry about this part ndum=1000 x = 10*np.random.random(ndum) y = x + np.random.normal(0,3,ndum) # and lets put in some bad data values at y=-10 derp=np.random.random(ndum) y=np.array([-10 if a<0.1 else b for a,b in zip(derp,y)]) # start paying attention here! # Let's bin up the data in x and look at average y value in each bin. # make 10 bins in x, each with width 1 nbin=10 binwidth=1 # calculate which bin each data point is in inbin=np.trunc(x/binwidth) # set up a variables to track binned quantities, # in this case the average of y and the number # of data points in each bin bin_avgval=np.zeros(nbin) bin_ndata=np.zeros(nbin) # also set up a variable to hold the x center # of the bin bin_xcent = np.zeros(nbin) # now, loop over each bin, calculating values for i in range(0,nbin): want = (inbin==i) & (y!=-10) # we only want data that's in the bin and good bin_ndata[i]=len(y[want]) bin_avgval[i]=np.average(y[want]) bin_xcent[i]=(i+0.5)*binwidth plt.scatter(x,y) plt.scatter(bin_xcent,bin_avgval,color='red',s=40) # now lets fit a line to the raw data # fit line the simplest way #good=(y!=-10) # dont want to include bad data #m,b=np.polyfit(x[good],y[good],1) #print 'slope=',m #print 'intercept=',b #plt.plot(x,x*m+b,color='green',lw=3) # a different way to do a linear fit, # that includes calculating errors on parameters good=(y!=-10) # dont want to include bad data coeff,cov=np.polyfit(x[good],y[good],1,cov=True) coeff_err = np.sqrt(np.diag(cov)) print 'slope=',coeff[0],'+/-',coeff_err[0] print 'intercept=',coeff[1],'+/-',coeff_err[1] xfit=np.linspace(0,10,100) polynomial=np.poly1d(coeff) plt.plot(xfit,polynomial(xfit),color='green',lw=3) print 'scatter=',np.std(y[good]-polynomial(x[good]))