爬虫数据分析------Pandas和Matplotlib

发布时间：2024-08-29 18:01

pandas 是基于NumPy 的一种工具，该工具是为了解决数据分析任务而创建的。Pandas 纳入了大量库和一些标准的数据模型，提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。你很快就会发现，它是使Python成为强大而高效的数据分析环境的重要因素之一。

Pandas数据类型

Series [ˈsɪəri:z] ：一维数组，与Numpy中的一维array类似。二者与Python基本的数据结构List也很相近，其区别是：List中的元素可以是不同的数据类型，而Array和Series中则只允许存储相同的数据类型，这样可以更有效的使用内存，提高运算效率。
Time-Series：以时间为索引的Series。
DataFrame：二维的表格型数据结构。很多功能与R中的data.frame类似。可以将DataFrame理解为Series的容器。以下的内容主要以DataFrame为主。
Panel ：三维的数组，可以理解为DataFrame的容器。（注意：Panel自pandas 0.25起已被移除，新版本中请改用带多级索引MultiIndex的DataFrame。）

Pandas和matplotlib、sqlalchemy的安装

安装pandas输入：sudo pip3 install pandas

安装matplotlib输入：sudo pip3 install matplotlib

安装sqlalchemy输入：sudo pip3 install sqlalchemy

使用代码加载数据文件gapminder.tsv

加载数据
import pandas

#文件后缀名任意,用read_csv读取的文件并不一定要以.csv结尾
#一定要加上sep=\'\\t\',否则报错;加上的意思是以\\t分开内容
df = pandas.read_csv(\"gapminder.tsv\",sep=\'\\t\')
print(df)
#DataFrame,pandas中的数据类型,相当于二维数组

#得到前面n行#默认返回前5行：tr = df.head()
#指定返回10行：tr =df.head(10)
#返回的类型还是DataFrame：print(type(df.head()))
#获取二维表的维度,他是属性,返回数据是元组：data = df.shape#(1704, 6)
#获取有多少条数据：data = df.shape[0]#1704

#获取有多少列：sum = df.shape[1]#6

#获取列名print(df.columns)

#Index([\'country\', \'continent\', \'year\', \'lifeExp\', \'pop\', \'gdpPercap\'], dtype=\'object\')

#遍历得到所有列
for column in df.columns:
print(column)
#得到所有列的类型：print(df.dtypes)

for a in zip(df.columns,df.dtypes):
print(a)
#列和类型合并并且转换成字典
columsTypes = dict(zip(df.columns,df.dtypes))
print(columsTypes.get(\"country\"))

print(columsTypes.get(\"pop\"))

pandas类型(dtype)和python类型(type)对照
pandas类型 python类型
object string
int64 int
float64 float

datetime64 datetime

#返回country列数据：country_df = df[\"country\"]#返回显示只有一部分,如果for循环可以全部打印出来

#同样也是默认返回5条：print(country_df.head())

#返回多列：subset = df[[\"country\",\"continent\",\"year\"]]

4. 查看数据集的行

#得到第一行：print(df.loc[0])

#得到第10行：print(df.loc[10])

#得到总行数：number_of_raws = df.shape[0] print(number_of_raws)#1704

#得到最后一行：last_raws = df.loc[number_of_raws-1]

#得到最后一行的前三个数据：last_raws = df.loc[number_of_raws-1].head(3)

#得到多行:2,4,6行：print(df.loc[[2,4,6]])

iloc的使用,支持用-1取最后一条数据

print(df.iloc[-1]) #loc和iloc,其中loc不支持-1

#取多条数据：print(df.iloc[[1,3,5]])

5. 查看数据集单元格中的数据

#根据列名得到指定的列

冒号(:)表示对行不切片,全部都要;逗号后面跟列的筛选条件,把需要的列写上

subset = df.loc[:,[\"year\",\"pop\"]]

#根据列的下标得到指定的列,loc不支持列的下标

#得到2,4和最后一列数据：subset = df.iloc[:,[2,4,-1]]

#通过索引得到指定范围的单元格数据

#切片得到下标是3到5的列对应的数据：subset = df.iloc[:,3:6]#iloc切片不包含结束索引6

#得到下标为1的一行中，下标是3到5的列对应的数据：subset = df.iloc[1,3:6]#iloc切片不包含结束索引6

#使用loc,取下标为1的行,并且列为year：subset = df.loc[1,[\"year\"]]

#使用loc,取下标为0到3的行,并且列为year和pop：subset = df.loc[0:3,[\"year\",\"pop\"]]#注意行包含后面的索引

#得到不连续的单元格数据,得到9个单元格数据：subset = df.iloc[[1,3,5],[0,2,4]]

2. 对数据集进行分组统计

#对数据进行分组统计
import pandas
df = pandas.read_csv(\"gapminder.tsv\",sep=\'\\t\')
#按照年份分组统计,全世界中各年预期寿命的平均值
print(df.groupby(\"year\")[\"lifeExp\"].mean())

#返回SeriesGroupBy：print(type(df.groupby(\"year\")[\"lifeExp\"]))

#多组统计,按照年份和洲分组,统计预期寿命和人均GDP(单位美元)

multi_group_var = df.groupby([\"year\",\"continent\"])[[\"lifeExp\",\"gdpPercap\"]].mean()

#重置索引,把分组键还原成普通列--也就是二维表格式DataFrame：print(multi_group_var.reset_index())

#统计数量,统计各洲国家的数量：df.groupby(\"continent\")[\"country\"].nunique()

可视化统计数据（依赖Matplotlib）

显示全球年平均寿命

import pandas
import matplotlib.pyplot as plot

df = pandas.read_csv(\"gapminder.tsv\",sep=\'\\t\')
#全球年平均寿命
global_yearly_life_expectancy = df.groupby(\"year\")[\"lifeExp\"].mean()
print(global_yearly_life_expectancy)
#使用matplotlib可视化显示--一维表数据
global_yearly_life_expectancy.plot()
#显示图例
plot.legend()
#显示标题
plot.title(\"全球年平均寿命\")
#显示
plot.show()
import matplotlib.pyplot as plot

df = pandas.read_csv(\"gapminder.tsv\",sep=\'\\t\')
#全球年平均寿命
global_yearly_life_expectancy = df.groupby(\"year\")[\"lifeExp\"].mean()
print(global_yearly_life_expectancy)
#使用matplotlib可视化显示--一维表数据
global_yearly_life_expectancy.plot()
#显示图例
plot.legend()
#显示标题
plot.title(\"全球年平均寿命\")
#显示
plot.show()

把两个坐标显示在一张图上

import pandas
import matplotlib.pyplot as plot

df = pandas.read_csv(\"gapminder.tsv\",sep=\'\\t\')
#全球年平均寿命
global_yearly_life_expectancy = df.groupby(\"year\")[\"lifeExp\"].mean()
print(global_yearly_life_expectancy)

#全球年人均GDP
global_gdppercap_life_expectancy = df.groupby(\"year\")[\"gdpPercap\"].mean()
print(global_gdppercap_life_expectancy)
#把两个坐标放在一个图上显示
#1,2的意思是1行两列显示;(8,4)宽可以理解为800像素高400像素
gig,(ax1,ax2) = plot.subplots(1,2,figsize=(8,4))
# gig,(ax1,ax2) = plot.subplots(2,1,figsize=(4,8))
#Figure(800x400)
print(gig)
#设置数据
ax1.plot(global_yearly_life_expectancy)
ax2.plot(global_gdppercap_life_expectancy)
#显示标题
ax1.set_title(\"全球年平均寿命\")
ax2.set_title(\"全球年人均GDP\")
#显示图例
ax1.legend()
ax2.legend()
#开始绘制
plot.show()
#把两个坐标放在一个图上显示
#1,2的意思是1行两列显示;(8,4)宽可以理解为800像素高400像素
gig,(ax1,ax2) = plot.subplots(1,2,figsize=(8,4))
# gig,(ax1,ax2) = plot.subplots(2,1,figsize=(4,8))
#Figure(800x400)
print(gig)
#设置数据
ax1.plot(global_yearly_life_expectancy)
ax2.plot(global_gdppercap_life_expectancy)
#显示标题
ax1.set_title(\"全球年平均寿命\")
ax2.set_title(\"全球年人均GDP\")
#显示图例
ax1.legend()
ax2.legend()
#开始绘制
plot.show()

解决Matplotlib乱码问题

第一步:配置雅黑字体

1)下载字体:msyh.ttf (微软雅黑)

下载地址：http://vdisk.weibo.com/s/voqVOI51n-e，也可以直接使用下载好的

2)放在matplotlib的字体文件夹下:

/usr/local/lib/python3.5/dist-packages/matplotlib/mpl-data/fonts/ttf/

第二步:修改matplotlib配置文件:

sudo subl /usr/local/lib/python3.5/dist-packages/matplotlib/mpl-data/matplotlibrc 删除font.family和font.sans-serif两行前的#,并在font.sans-serif后添加中文字体 Microsoft YaHei, ...(其余不变)

执行命令打开配置文件：

sudo subl /usr/local/lib/python3.5/dist-packages/matplotlib/mpl-data/matplotlibrc

第三步：正常显示负号

添加完字体名字，再找到#axes.unicode_minus，去掉“#”，并把“True”改成“False”，这样就可以正常显示负号。

Pandas中的数据类型

1. 创建Series from pandas import Series

#按照int类型处理(全部是整数时dtype为int64)：s1 = Series([33,44,55])
#按照float类型处理：s2 = Series([33,44.5,88])

#只有全部元素都是bool类型,才会用bool存储数据：s3 = Series([False,True])

#使用object存储数据：s4 = Series([33,44.5,88,False,True])

#修改索引,默认是数字：ss = Series([\"阿福\",\"硅谷老师\"],index=[\"姓名\",\"职位\"])

2. 创建DataFrame from pandas import DataFrame

#默认字典的顺序是乱的(Python 3.7之前),只有通过columns指定才能固定列的顺序
#如果不存在的列会显示NaN,如果数据不成对会报错
students = DataFrame({
   \"names\":[\"张三\",\"李四\"],
   \"position\":[\"班长\",\"学习委员\"],
   \"birthday\":[\"1993-04-06\",\"1995-05-09\"],
   \"died\":[\"2090-04-06\",\"2090-05-09\"],
   \"age\":[97,95]
},columns=[\"position\",\"birthday\",\"died\",\"age\"]
 ,index=[\"张三\",\"李四\"]#用name指定成索引
)

使用有序的字典：from collections import OrderedDict

students = DataFrame(OrderedDict([
   (\"names\",[\"张三\",\"李四\"]),
   (\"position\",[\"班长\",\"学习委员\"]),
   (\"birthday\",[\"1993-04-06\",\"1995-05-09\"]),
   (\"died\",[\"2090-04-06\",\"2090-05-09\"]),
   (\"age\",[97,95])
]))

3. DataFrame的基本操作

#返回索引Index([\'张三\', \'李四\'], dtype=\'object\')：print(students.index)

#得到第一行数据(loc按行索引取值,行索引是"张三"/"李四",而"age"是列名)：first_raw = students.loc[\"张三\"]

#得到第一行的索引：print(first_raw.index)
#返回Index([\'position\', \'birthday\', \'died\', \'age\'], dtype=\'object\')
#和上面等价：print(first_raw.keys())

#第一行得到值列表：print(first_raw.values)

#得到第一行索引的第一个索引position：print(first_raw.index[0])

4. Series的方法

平均值： students.loc[\"age\"].mean()
最大值： students.loc[\"age\"].max()
最小值： students.loc[\"age\"].min()
标准差： students.loc[\"age\"].std()
排序ascending=False从大到小，True从小到大： students.loc[\"age\"].sort_values(ascending=False)
#Series添加Series数据：print(students.loc[\"age\"].append(ages))
#绘制图形：students.loc[\"age\"].hist()
#显示：show()

5. Series的条件过滤

from pandas import read_csv

scientists = read_csv(\"scientists.csv\")
print(scientists)
ages = scientists[\"Age\"]
print(ages)
#describe()描述方法,返回一些基本的信息max，min，mean，std
print(ages.describe())
ages = scientists[\"Age\"]
print(ages)
#describe()描述方法,返回一些基本的信息max，min，mean，std
print(ages.describe())

#describe()描述方法,返回一些基本的信息：print(ages.describe())

#得到大于平均年龄(59.125000)的数据：print(ages[ages > ages.mean()])

#控制哪行显示,哪行不显示，True显示，False不显示：print(ages[[True,False,True,True,True,True,False,True]])

6. Series的向量操作

from pandas import read_csv,Series
scientists = read_csv(\"scientists.csv\")
#Series对象
ages = scientists[\"Age\"]

#对应位置的数相加：print(ages + ages)

#加上数值：print(ages + 100)

#加上一个只有一部分数据的Series,只显示有数据的,其他部分用NaN来填充缺失值：print(ages + Series([2,100]) )

import numpy
#要对应位置相加,数组长度必须与Series的长度一致,否则会报错： print(ages + numpy.array([1,2,3]))

7 .DataFrame的条件过滤

from pandas import read_csv,Series
#返回的是DataFrame对象
scientists = read_csv(\"scientists.csv\")

#返回年龄大于平均年龄的数据：print(\"年龄大于平均年龄的数据:\\n\",scientists[scientists[\'Age\'] > scientists[\"Age\"].mean()])

#控制数据显示：print(\"控制数据显示\\n\",scientists.loc[[True,True,False,False,True]])

#控制索引显示：print(\"iloc控制索引显示\\n\",scientists.iloc[[1,3,4]])

#只要\"Name\",\"Age\",\"Occupation\"三列：print(\"对列进行过滤\\n\",scientists[[\"Name\",\"Age\",\"Occupation\"]])

#同时对列和行过滤：lines = scientists[[\"Name\",\"Age\",\"Occupation\"]][scientists[\'Age\'] > scientists[\"Age\"].mean()]

#使用loc再过滤值显示前两行：print(lines.loc[[True,True]])

8. DataFrame的向量操作，可以做对应数据的向量相加减

9. 向DataFrame添加列(类型要一致)

from pandas import read_csv,to_datetime

scientists = read_csv(\"scientists.csv\")
#把类型是object的Born转换成datetime64类型
born_dt = to_datetime(scientists[\"Born\"])
died_dt = to_datetime(scientists[\"Died\"])

列存在就修改不存在就创建一列：scientists[\"born_dt\"]= born_dt

#把born_dt和died_dt添加到scientists中：scientists[\"born_dt\"],scientists[\"died_dt\"] = (born_dt,died_dt)

10. 直接修改DataFrame中列的值

from pandas import read_csv
import random

scientists = read_csv(\"scientists.csv\")

#生成要修改成的数据(列表推导式)：ages = [random.randint(10,80) for i in range(8)]

#修改一整列：scientists[\"Age\"] = ages

11. 删除DataFrame中的列

#删除列Age和Died列数据：scientists_droped = scientists.drop([\"Age\",\"Died\"],axis=1)

axis默认是0，默认是删除行

12. 读写Pickle格式的Series和DataFrame文件

Pickle格式是二进制格式

from pandas import read_pickle,read_csv
import os

scientists = read_csv(\"scientists.csv\")
# print(scientists)
#把所有的内容转换成pickle格式
scientists_pickle_path = \"./output/scientists_pickle.pickle\"
if not os.path.exists(scientists_pickle_path):
   scientists_pickle = scientists.to_pickle(scientists_pickle_path)

#把Name列转换成pickle格式
scientists_name_path = \"./output/scientists_name.pickle\"
scientists_name = scientists[\"Name\"]
if not os.path.exists(scientists_name_path):
   scientists_pickle = scientists_name.to_pickle(scientists_name_path)#数据保存到scientists_name_path 文件

#读取pickle格式并且打印出来
pickle = read_pickle(\"./output/scientists_pickle.pickle\")
print(pickle)
print(\"-\"*60)
name = read_pickle(\"./output/scientists_name.pickle\")
print(name)

# 把索引去掉
scientists_name_series_path = \"./output/scientists_name_series3.csv\"
if not os.path.exists(scientists_name_series_path):
# 把分割符号修改成*

scientists_name_series.to_csv(scientists_name_series_path, sep=\"*\", index=False)

三、连接与合并数据集

#把三个DataFrame数据以行连接起来,相当于数据库中的 union all：raw_concat = concat([s1,s2,s3])

#新的列索引接着开始：print(concat([s1,new_row_df],ignore_index=True))

#axis是0行连接，axis是1是列连接：col_concat = pd.concat([df1,df2,df3],axis = 1)

# 添加一列：col_concat[\'new_col_list\'] = [\'n1\',\'n2\',\'n3\',\'n4\']

#axis=1是列连接，ignore_index忽略索引：col_concat = pd.concat([df1,df2,df3],axis=1,ignore_index=True)

拥有不同列的DataFrame的行连接

四、Matplotlib 基础知识

1 Matplotlib简介（绘制第一个图形）

from matplotlib.pyplot import plot,show,savefig

#绘制y = x^2
X = range(-100,100)
Y = [x**2 for x in X]
plot(X,Y)
#保存图片
savefig(\"result.png\")
#显示图片
show()

2.在数据可视化的过程中使用NumPy（绘制正弦和余弦曲线）

绘制正弦

from matplotlib.pyplot import plot,show
import math, numpy

#范围0~2π,共100等分
X = numpy.linspace(0,2*numpy.pi,100)
#绘制sin图像
Y = numpy.sin(X)
plot(X,Y)
show()

绘制余弦

from matplotlib.pyplot import plot,show
import math, numpy

#范围0~2π,共100等分
X = numpy.linspace(0,2*numpy.pi,100)
#绘制cos图像
Y = numpy.cos(X)
plot(X,Y)
show()

将文件作为数据源绘制曲线

import numpy
from matplotlib.pyplot import plot,show

#加载数据
data = numpy.loadtxt(\"data.txt\")
#原样输出,6行3列
print(data)
print(\"-\"*50)
#转一下,变成3行6列
print(data.T)
for raw in data.T:
   print(\"X=\",data.T[0])
   print(\"Y=\",raw)
   plot(data.T[0],raw)
#显示    
show()

绘制随机点

import random
import matplotlib.pyplot as plt

#计划生成1024个随机的点
count = 1024
X = [random.random() for i in range(count)]
Y = [random.random() for i in range(count)]
print(X)
#绘制连续的
# plt.plot(X,Y)
#绘制随机的
plt.scatter(X,Y)
#显示
plt.show()
 plt.plot(X,Y)
#绘制随机的
plt.scatter(X,Y)
#显示
plt.show()

绘制垂直柱状图

from matplotlib.pyplot import plot,show,bar,barh
#bar绘制垂直柱状图,barh绘制水平柱状图
#x坐标
X = [0,1,2,3]
#y轴坐标
Y = [22,11,44,15]
#绘制垂直柱状图
bar(X,Y,width=0.5)#width 控制柱子在x轴方向的宽度
#真正绘制
show()

水平柱状图

from matplotlib.pyplot import plot,show,bar,barh

X = [0,1,2,3]
#y轴坐标
Y = [22,11,44,15]
#绘制水平柱状图
# barh(X,Y)
#年份和人均GDP，height=2是柱子在年份轴方向上的厚度
barh([1995,1998,2003,2008],[1000,2000,3000,4000],height=2)
#真正绘制
show()

绘制多组垂直柱状图

import matplotlib.pyplot as plt

#准备三组Y轴数据
Y = [
   [11,22,44,66],
   [55,66,77,22],
   [44,55,66,77]
]
#绘制三个垂直柱状图像
plt.bar(range(4),Y[0],width=0.25)
plt.bar([x + 0.25 for x in range(4)],Y[1],width=0.25)
plt.bar([x + 0.5 for x in range(4)],Y[2],width=0.25)
plt.show()

绘制多组水平柱状图

import matplotlib.pyplot as plt

#准备三组Y轴数据
Y = [
   [11,22,44,66],
   [55,66,77,22],
   [44,55,66,77]
]
#绘制三个水平柱状图像
plt.barh(range(4),Y[0],height=0.25)
plt.barh([x + 0.25 for x in range(4)],Y[1],height=0.25)
plt.barh([x + 0.5 for x in range(4)],Y[2],height=0.25)
plt.show()

在同一个窗口绘制直方图和盒状图

import numpy
import matplotlib.pyplot as plot

#hist:直方图，boxplot:盒状图
#randn函数返回一个或一组样本，具有标准正态分布
data = numpy.random.randn(100)
#把显示分为两个区域
fig,(ax1,ax2) = plot.subplots(1,2,figsize=(8,4))
print(fig)#Figure(800x400)
#直方图，分为100份
ax1.hist(data,100)
#盒状图
ax2.boxplot(data)
#显示
plot.show()

绘制饼状图--pie

import matplotlib.pyplot as plt

#数据
x = [11,33,55,5,66]
#绘制饼状图
plt.pie(x)
#添加图例
plt.legend(x)
#开始绘制
plt.show()

爬虫数据分析------Pandas和Matplotlib

相关推荐