import matplotlib.pyplot as plt import numpy as np plt.close('all') # make up some dummy data to test # don't worry about this part ndum=1000 x = 10*np.random.random(ndum) y = x + np.random.normal(0,3,ndum) # and lets put in some bad data values at y=-10 derp=np.random.random(ndum) y=np.array([-10 if a<0.1 else b for a,b in zip(derp,y)]) # start paying attention here! # Let's bin up the data in x and look at average y value in each bin. # make 10 bins in x, each with width 1 nbin=10 binwidth=1 # calculate which bin each data point is in inbin=np.trunc(x/binwidth) # set up a variables to track binned quantities, # in this case the average of y and the number # of data points in each bin bin_avgval=np.zeros(nbin) bin_ndata=np.zeros(nbin) # also set up a variable to hold the x center # of the bin bin_xcent = np.zeros(nbin) # now, loop over each bin, calculating values for i in range(0,nbin): want = (inbin==i) & (y!=-10) # we only want data that's in the bin and good bin_ndata[i]=len(y[want]) bin_avgval[i]=np.average(y[want]) bin_xcent[i]=(i+0.5)*binwidth plt.scatter(x,y) plt.scatter(bin_xcent,bin_avgval,color='red',s=40) # make a linear fit, and calculate uncertainty and scatter good=(y!=-10) # dont want to include bad data coeff,cov=np.polyfit(x[good],y[good],1,cov=True) coeff_err = np.sqrt(np.diag(cov)) print(' slope = {:.3f} +/- {:.3f}'.format(coeff[0],coeff_err[0])) print('intercept = {:.3f} +/- {:.3f}'.format(coeff[1],coeff_err[1])) xfit=np.linspace(0,10,100) polynomial=np.poly1d(coeff) plt.plot(xfit,polynomial(xfit),color='green',lw=3) print(' scatter = {:.3f}'.format(np.std(y[good]-polynomial(x[good]))))