import matplotlib.pyplot as plt
import numpy as np
from numpy.random import randn
# Smoke test: draw the integers 0..9 as a single line and display the figure.
ramp = np.arange(10)
plt.plot(ramp)
plt.show()
# Build one figure with a 2x2 grid of subplots and populate three of them.
fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
# pyplot-level calls draw on the current (most recently created) subplot.
plt.plot(np.random.randn(50).cumsum(), 'k--')
# The return value of hist is not needed, hence the throwaway name.
_ = ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
plt.show()
# plt.subplots creates the figure and a 2x3 grid of subplots in one call.
fig, axes = plt.subplots(2, 3)
# Echo the returned axes container (only displays in an interactive session).
axes
# subplots_adjust(left=None,bottom=None,right=None,top=None,wspace=None,hspace=None)
# 2x2 grid sharing both axes; every panel gets its own histogram of 500 draws.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
    for j in range(2):
        axes[i, j].hist(np.random.randn(500), bins=50, color='k', alpha=0.5)
# Remove the horizontal and vertical spacing between the panels.
plt.subplots_adjust(wspace=0, hspace=0)
plt.show()
# Illustrative call forms only: `ax`, `x` and `y` are placeholders that are not
# defined at this point, so executing these lines raised NameError.
# Both calls are equivalent ways to draw a green dashed line:
#   ax.plot(x, y, 'g--')
#   ax.plot(x, y, linestyle='--', color='g')
# Colour, marker and line style in one format string: black, circles, dashed.
plt.plot(np.random.randn(30).cumsum(), 'ko--')
plt.show()
# The same styling spelled out explicitly with keyword arguments.
plt.plot(np.random.randn(30).cumsum(), color='k', linestyle='dashed', marker='o')
# In line plots the segments between actual data points are interpolated
# linearly by default; the drawstyle option changes that.
data = np.random.randn(30).cumsum()
plt.plot(data, 'ko--', label='Default')
plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')
plt.legend(loc='best')
plt.show()
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(randn(1000).cumsum())
# The simplest way to change the x-axis ticks is set_xticks plus
# set_xticklabels.
ticks = ax.set_xticks([0, 250, 500, 750, 1000])
labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],
                            rotation=30, fontsize='small')
ax.set_xlabel('Stages')
ax.set_title('My first matplotlib plot')
plt.show()
# Three random walks on one subplot, each with its own line format and a
# legend entry taken from its label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for line_format, legend_name in (('k', 'one'), ('k--', 'two'), ('k.', 'three')):
    ax.plot(randn(1000).cumsum(), line_format, label=legend_name)
ax.legend(loc='best')
plt.show()
# text() draws a string at the given coordinates (x, y) of the plot.
# Illustrative call form only: `x` and `y` are placeholders that are not
# defined at this point, so executing the line raised NameError:
#   ax.text(x, y, 'Hello world', family='monospace', fontsize=10)
# To add a shape to a plot, create a patch object and attach it to a subplot
# with ax.add_patch(shp).
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]], color='g', alpha=0.5)
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)
# Save before show(): after show() returns the figure may already have been
# closed, in which case savefig would write an empty canvas.
plt.savefig('figpath.png', dpi=400, bbox_inches='tight')
plt.show()
# savefig does not have to write to disk; it accepts any file-like object.
# Image data is binary, so the in-memory buffer is a BytesIO (a StringIO only
# accepts text).
from io import BytesIO, StringIO
buffer = BytesIO()
plt.savefig(buffer)
plot_data = buffer.getvalue()
# This is handy for serving dynamically generated images on the web.
# Set the global default figure size to 10 x 10.
plt.rc('figure', figsize=(10, 10))
# Options can also be collected in a dict and unpacked into rc().
# NOTE(review): some matplotlib versions only accept a numeric font size
# here -- confirm against the installed version.
font_options = {'family': 'monospace', 'weight': 'bold', 'size': 'small'}
plt.rc('font', **font_options)
提醒,关于这部分内容参考最新的pandas在线文档是最好的学习方式
#Series和DataFrame都有一个用于生成各类图表的plot方法,默认情况下,它们所生成的是线型图
from pandas import Series,DataFrame
# Ten-step random walk indexed 0, 10, ..., 90; Series.plot() draws it as a
# line plot.
walk_index = np.arange(0, 100, 10)
s = Series(randn(10).cumsum(), index=walk_index)
s.plot()
plt.show()
#该Series的索引值会被传给matplotlib,并用以绘制X轴。可以通过use_index=False禁用。X轴的刻度和界限可以通过xticks和xlim选项进行调节
#Y轴就用yticks和ylim
#pandas的大部分绘图方法都有一个可选的ax参数,它可以是一个matplotlib的subplot对象,能使你在网格布局中更为灵活地处理subplot的位置
#DataFrame的plot方法会在一个subplot中为各列绘制一条线,并自动创建图例
# Four cumulative-sum columns; DataFrame.plot() draws one line per column.
walks = randn(10, 4).cumsum(0)
df = DataFrame(walks, columns=['A', 'B', 'C', 'D'], index=np.arange(0, 100, 10))
df.plot()
plt.show()
在生成线型图的代码中加上kind='bar'(垂直柱状图)或kind='barh'(水平柱状图)即可生成柱状图。这时,Series和DataFrame的索引会被用作X(bar)或Y(barh)刻度
# Vertical and horizontal bar charts of the same Series, one per subplot row.
fig, axes = plt.subplots(2, 1)
data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))
for bar_kind, target_ax in zip(('bar', 'barh'), axes):
    data.plot(kind=bar_kind, ax=target_ax, color='k', alpha=0.7)
plt.show()
# For a DataFrame, a bar chart groups the values of each row together.
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
column_labels = ['A', 'B', 'C', 'D']
df = DataFrame(np.random.rand(6, 4), index=row_labels, columns=column_labels)
df
A | B | C | D | |
---|---|---|---|---|
one | 0.605969 | 0.392503 | 0.159506 | 0.689187 |
two | 0.706356 | 0.548750 | 0.489465 | 0.886399 |
three | 0.539584 | 0.598980 | 0.482615 | 0.478261 |
four | 0.277114 | 0.683394 | 0.407497 | 0.671090 |
five | 0.201349 | 0.797898 | 0.454740 | 0.355270 |
six | 0.113781 | 0.288068 | 0.597394 | 0.130346 |
# Grouped vertical bars first, then horizontal bars stacked per row.
for plot_options in ({'kind': 'bar'}, {'kind': 'barh', 'stacked': True}):
    df.plot(**plot_options)
    plt.show()
#柱状图还有一个非常不错的用途:利用value_counts图形化显示Series中各值出现频率,比如s.value_counts().plot(kind='bar')
#小栗子
import pandas as pd
tips = pd.read_csv('ch08/tips.csv')
# The size column must be selected with tips['size']: attribute access
# (tips.size) resolves to the DataFrame's own `size` attribute instead.
day_column = tips.day
size_column = tips['size']
party_counts = pd.crosstab(day_column, size_column)
party_counts
size | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|
day | ||||||
Fri | 1 | 16 | 1 | 1 | 0 | 0 |
Sat | 2 | 53 | 18 | 13 | 1 | 0 |
Sun | 0 | 39 | 15 | 18 | 3 | 1 |
Thur | 1 | 48 | 4 | 5 | 1 | 3 |
# Keep only party sizes 2 through 5. The columns are labelled by party size,
# so this is a label-based, inclusive slice; .loc replaces the removed .ix.
party_counts = party_counts.loc[:, 2:5]
# Normalize so that every row sums to 1 (the divisor is cast to float).
party_pcts = party_counts.div(party_counts.sum(axis=1).astype(float), axis=0)
party_pcts
size | 2 | 3 | 4 | 5 |
---|---|---|---|---|
day | ||||
Fri | 0.888889 | 0.055556 | 0.055556 | 0.000000 |
Sat | 0.623529 | 0.211765 | 0.152941 | 0.011765 |
Sun | 0.520000 | 0.200000 | 0.240000 | 0.040000 |
Thur | 0.827586 | 0.068966 | 0.086207 | 0.017241 |
# Stacked bars of the normalized party-size shares per day; the data suggest
# that party sizes grow on the weekend.
party_pcts.plot(kind='bar', stacked=True)
plt.show()
# Tip as a fraction of the total bill, shown as a 50-bin histogram.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tip_share = tips['tip_pct']
tip_share.hist(bins=50)
plt.show()
# Density plot, selected with kind='kde'.
tip_share.plot(kind='kde')
plt.show()
# Bimodal sample built from two different normal distributions.
comp1 = np.random.normal(0, 1, size=200)   # N(0, 1)
comp2 = np.random.normal(10, 2, size=200)  # N(10, 4): std 2, variance 4
values = Series(np.concatenate([comp1, comp2]))
# density=True replaces the removed `normed` argument: the histogram is scaled
# to unit area so it is comparable with the KDE curve drawn on top of it.
values.hist(bins=100, alpha=0.3, color='k', density=True)
values.plot(kind='kde', style='k--')
plt.show()
散布图(scatter plot)是观察两个一维数据序列之间关系的有效手段
macro = pd.read_csv('ch08/macrodata.csv')
selected_columns = ['cpi', 'm1', 'tbilrate', 'unemp']
data = macro[selected_columns]
# Log differences of the selected columns; rows with missing values are dropped.
log_levels = np.log(data)
trans_data = log_levels.diff().dropna()
trans_data[-5:]
cpi | m1 | tbilrate | unemp | |
---|---|---|---|---|
198 | -0.007904 | 0.045361 | -0.396881 | 0.105361 |
199 | -0.021979 | 0.066753 | -2.277267 | 0.139762 |
200 | 0.002340 | 0.010286 | 0.606136 | 0.160343 |
201 | 0.008419 | 0.037461 | -0.200671 | 0.127339 |
202 | 0.008894 | 0.012202 | -0.405465 | 0.042560 |
# Scatter plot of the log changes in m1 against those in unemp.
plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s ' % ('m1', 'unemp'))
plt.show()
# Pairwise scatter plots of all columns with a density estimate on the
# diagonal. scatter_matrix lives in pandas.plotting; the top-level
# pd.scatter_matrix alias has been removed.
pd.plotting.scatter_matrix(trans_data, diagonal='kde', alpha=0.3)
plt.show()
import pandas as pd
# Load ch08/Haiti.csv and preview its first rows.
haiti_csv = 'ch08/Haiti.csv'
data = pd.read_csv(haiti_csv)
data.head()
Serial | INCIDENT TITLE | INCIDENT DATE | LOCATION | DESCRIPTION | CATEGORY | LATITUDE | LONGITUDE | APPROVED | VERIFIED | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 4052 | * URGENT * Type O blood donations needed in #J… | 05/07/2010 17:26 | Jacmel, Haiti | Birthing Clinic in Jacmel #Haiti urgently need… | 1. Urgences | Emergency, 3. Public Health, | 18.233333 | -72.533333 | YES | NO |
1 | 4051 | Food-Aid sent to Fondwa, Haiti | 28/06/2010 23:06 | fondwa | Please help food-aid.org deliver more food to … | 1. Urgences | Emergency, 2. Urgences logistiqu… | 50.226029 | 5.729886 | NO | NO |
2 | 4050 | how haiti is right now and how it was during t… | 24/06/2010 16:21 | centrie | i feel so bad for you i know i am supposed to … | 2. Urgences logistiques | Vital Lines, 8. Autr… | 22.278381 | 114.174287 | NO | NO |
3 | 4049 | Lost person | 20/06/2010 21:59 | Genoca | We are family members of Juan Antonio Zuniga O… | 1. Urgences | Emergency, | 44.407062 | 8.933989 | NO | NO |
4 | 4042 | Citi Soleil school | 18/05/2010 16:26 | Citi Soleil, Haiti | We are working with Haitian (NGO) -The Christi… | 1. Urgences | Emergency, | 18.571084 | -72.334671 | YES | NO |
# Preview the date and coordinate columns for the first ten rows.
data[['INCIDENT DATE','LATITUDE','LONGITUDE']][:10]
INCIDENT DATE | LATITUDE | LONGITUDE | |
---|---|---|---|
0 | 05/07/2010 17:26 | 18.233333 | -72.533333 |
1 | 28/06/2010 23:06 | 50.226029 | 5.729886 |
2 | 24/06/2010 16:21 | 22.278381 | 114.174287 |
3 | 20/06/2010 21:59 | 44.407062 | 8.933989 |
4 | 18/05/2010 16:26 | 18.571084 | -72.334671 |
5 | 26/04/2010 13:14 | 18.593707 | -72.310079 |
6 | 26/04/2010 14:19 | 18.482800 | -73.638800 |
7 | 26/04/2010 14:27 | 18.415000 | -73.195000 |
8 | 15/03/2010 10:58 | 18.517443 | -72.236841 |
9 | 15/03/2010 11:00 | 18.547790 | -72.410010 |
# The CATEGORY field holds a comma-separated list of codes that indicate the
# type of each message; show the first six values.
data['CATEGORY'][:6]
0 1. Urgences | Emergency, 3. Public Health, 1 1. Urgences | Emergency, 2. Urgences logistiqu… 2 2. Urgences logistiques | Vital Lines, 8. Autr… 3 1. Urgences | Emergency, 4 1. Urgences | Emergency, 5 5e. Communication lines down, Name: CATEGORY, dtype: object
# Summary statistics of the data (only displayed in an interactive session).
data.describe()
Serial | LATITUDE | LONGITUDE | |
---|---|---|---|
count | 3593.000000 | 3593.000000 | 3593.000000 |
mean | 2080.277484 | 18.611495 | -72.322680 |
std | 1171.100360 | 0.738572 | 3.650776 |
min | 4.000000 | 18.041313 | -74.452757 |
25% | 1074.000000 | 18.524070 | -72.417500 |
50% | 2163.000000 | 18.539269 | -72.335000 |
75% | 3088.000000 | 18.561820 | -72.293570 |
max | 4052.000000 | 50.226029 | 114.174287 |
# Drop rows whose coordinates fall outside the 18..20 latitude / -75..-70
# longitude box, as well as rows without category information.
lat_ok = (data.LATITUDE > 18) & (data.LATITUDE < 20)
lon_ok = (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
has_category = data.CATEGORY.notnull()
data = data[lat_ok & lon_ok & has_category]
def to_cat_list(catstr):
    """Split a comma-separated category string into a list of clean entries.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping (for example the one after a trailing comma) are dropped.
    """
    stripped = (x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]
def get_all_categories(cat_series):
    """Return the sorted list of distinct categories found in *cat_series*.

    Every element of *cat_series* is a comma-separated category string that is
    split with to_cat_list(). An empty input yields an empty list.
    """
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    # set().union(...) also accepts zero arguments, whereas the unbound
    # set.union(*cat_sets) raises TypeError when there is nothing to unpack.
    return sorted(set().union(*cat_sets))
def get_english(cat):
    """Split a category string into its code and English name.

    The text before the first '.' is the code. If the remainder contains a
    '|', the part after the first '|' is taken as the name; otherwise the
    whole remainder is. Returns a ``(code, name)`` tuple with the name
    stripped of surrounding whitespace.

    Raises ValueError if *cat* contains no '.'.
    """
    # Split on the first '.' only, so dots inside the name do not break the
    # two-way unpacking.
    code, names = cat.split('.', 1)
    if '|' in names:
        names = names.split('|')[1]
    return code, names.strip()
# Example call; in an interactive session this echoes ('2', 'Vital Lines').
get_english('2. Urgences logistique |Vital Lines')
# Build a dict that maps each category code to its name; the analysis works
# with the codes.
all_cats = get_all_categories(data.CATEGORY)
# A generator expression feeds the (code, name) pairs straight into dict().
english_mapping = dict(get_english(x) for x in all_cats)
english_mapping['2a']
# Echoed result: 'Food Shortage'
english_mapping['6c']
# Echoed result: 'Earthquake and aftershocks'
#抽取出唯一的分类编码,构造一个全零DataFrame
from pandas import DataFrame
def get_code(seq):
    """Return the code (the text before the first '.') of each entry in *seq*.

    Falsy entries such as empty strings are skipped.
    """
    return [x.split('.')[0] for x in seq if x]
all_codes = get_code(all_cats)
# One column per distinct code, one row per row of data, filled with zeros.
code_index = pd.Index(np.unique(all_codes))
zero_block = np.zeros((len(data), len(code_index)))
dummy_frame = DataFrame(zero_block, index=data.index, columns=code_index)
dummy_frame.head()
1 | 1a | 1b | 1c | 1d | 2 | 2a | 2b | 2c | 2d | … | 7c | 7d | 7g | 7h | 8 | 8a | 8c | 8d | 8e | 8f | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 45 columns
# Set the matching code columns of every row to 1, then join the indicator
# columns onto data.
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    # A single .loc[row, columns] assignment writes into dummy_frame itself;
    # the chained .ix[row][codes] form used the removed .ix indexer and could
    # end up assigning to a temporary copy.
    dummy_frame.loc[row, codes] = 1
data = data.join(dummy_frame.add_prefix('category_'))
data.head()
Serial | INCIDENT TITLE | INCIDENT DATE | LOCATION | DESCRIPTION | CATEGORY | LATITUDE | LONGITUDE | APPROVED | VERIFIED | … | category_7c | category_7d | category_7g | category_7h | category_8 | category_8a | category_8c | category_8d | category_8e | category_8f | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4052 | * URGENT * Type O blood donations needed in #J… | 05/07/2010 17:26 | Jacmel, Haiti | Birthing Clinic in Jacmel #Haiti urgently need… | 1. Urgences | Emergency, 3. Public Health, | 18.233333 | -72.533333 | YES | NO | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 4042 | Citi Soleil school | 18/05/2010 16:26 | Citi Soleil, Haiti | We are working with Haitian (NGO) -The Christi… | 1. Urgences | Emergency, | 18.571084 | -72.334671 | YES | NO | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 4041 | Radio Commerce in Sarthe | 26/04/2010 13:14 | Radio Commerce Shelter, Sarthe | i’m Louinel from Sarthe. I’d to know what can … | 5e. Communication lines down, | 18.593707 | -72.310079 | YES | NO | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
6 | 4040 | Contaminated water in Baraderes. | 26/04/2010 14:19 | Marc near Baraderes | How do we treat water in areas without Pipe?\t… | 4. Menaces | Security Threats, 4e. Assainissem… | 18.482800 | -73.638800 | YES | NO | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 4039 | Violence at "arcahaie bas Saint-Ard" | 26/04/2010 14:27 | unable to find "arcahaie bas Saint-Ard&qu… | Goodnight at (arcahaie bas Saint-Ard) 2 young … | 4. Menaces | Security Threats, | 18.415000 | -73.195000 | YES | NO | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 55 columns
from mpl_toolkits.basemap import Basemap
def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75.0, urlon=-71.0):
    """Create a Basemap for the given bounding box and draw its outlines.

    Parameters
    ----------
    ax : optional
        Passed to Basemap as the axes to draw on.
    lllat, urlat : float
        Latitude of the lower-left and upper-right corners.
    lllon, urlon : float
        Longitude of the lower-left and upper-right corners.

    Returns
    -------
    The configured Basemap instance, centred on the middle of the box.
    """
    # Create a stereographic-projection Basemap instance.
    m = Basemap(ax=ax, projection='stere', lon_0=(urlon + lllon) / 2,
                lat_0=(urlat + lllat) / 2, llcrnrlat=lllat,
                urcrnrlat=urlat, llcrnrlon=lllon, urcrnrlon=urlon, resolution='f')
    # Draw coastlines, state boundaries and country boundaries.
    m.drawcoastlines()
    m.drawstates()
    # drawcountries (national borders), not drawcounties (US county lines).
    m.drawcountries()
    return m
# For each category, find the matching coordinates in the data set, draw a
# Basemap on the corresponding subplot, transform the coordinates, and plot
# the points with the Basemap's plot method.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25
urlat = 20.25
lllon = -75
urlon = -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    # Rows flagged with this category code.
    cat_data = data[data['category_%s' % code] == 1]
    # Convert longitude/latitude into map projection coordinates.
    x, y = m(list(cat_data.LONGITUDE), list(cat_data.LATITUDE))
    m.plot(x, y, 'k.', alpha=0.5)
    ax.set_title('%s:%s' % (code, english_mapping[code]))
plt.show()