import matplotlib.pyplot as plt
import numpy as np

plt.close('all')

# make up some dummy data to test
# don't worry about this part
ndum = 1000
x = 10*np.random.random(ndum)
y = x + np.random.normal(0, 3, ndum)

# and let's put in some bad data values at y=-10
derp = np.random.random(ndum)
y = np.array([-10 if a < 0.1 else b for a, b in zip(derp, y)])

# start paying attention here!

# Let's bin up the data in x and look at the average y value and
# standard deviation in each bin.
# We'll use the binned_statistic function from scipy.stats
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html
from scipy.stats import binned_statistic

# define the good data (exclude the bad values at y=-10)
good = (y != -10)

# now work out the average and standard deviation of the y values in 10 bins of width 1
binvals = np.arange(0, 11, 1)
bin_avg, bin_edges, bin_num = binned_statistic(x[good], y[good],
                                               statistic='mean', bins=binvals)
bin_std, bin_edges, bin_num = binned_statistic(x[good], y[good],
                                               statistic='std', bins=binvals)

# this bit of magic works out the centers of the bins
bin_cent = 0.5*(bin_edges[1:] + bin_edges[:-1])

# plot the data
plt.scatter(x, y, s=5)
plt.xlabel('x')
plt.ylabel('y')

# plot the mean and standard deviation as red points with errorbars
plt.errorbar(bin_cent, bin_avg, yerr=bin_std, fmt='o', color='red')

# make a linear fit, and calculate the uncertainty and scatter
coeff, cov = np.polyfit(x[good], y[good], 1, cov=True)
coeff_err = np.sqrt(np.diag(cov))
print('    slope = {:.3f} +/- {:.3f}'.format(coeff[0], coeff_err[0]))
print('intercept = {:.3f} +/- {:.3f}'.format(coeff[1], coeff_err[1]))

xfit = np.linspace(0, 10, 100)
polynomial = np.poly1d(coeff)
plt.plot(xfit, polynomial(xfit), color='green', lw=3)

print('  scatter = {:.3f}'.format(np.std(y[good] - polynomial(x[good]))))
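
# Optional sanity check (not part of the original recipe): a minimal sketch of
# what binned_statistic is doing, computed by hand with np.digitize.  The
# variable names which_bin and manual_avg are just illustrative here.  Note one
# small edge-case difference: binned_statistic includes x == 10 in the last
# bin, while digitize with these edges would not (it never matters for this
# dummy data, since 10*np.random.random() is always < 10).
which_bin = np.digitize(x[good], binvals)   # bin index 1..10 for x in [0, 10)
manual_avg = np.array([y[good][which_bin == i].mean()
                       for i in range(1, len(binvals))])
print('binned_statistic and manual means agree:',
      np.allclose(bin_avg, manual_avg, equal_nan=True))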