











import numpy as np
import pandas as pd

# Aggregation functions: a 5-student x 4-subject table of random scores.
df_index = ["Tony", "Alex", "Jack", "Mary", "Kelly"]
df_columns = ["语文", "数学", "英语", "物理"]
df = pd.DataFrame(
    data=np.random.randint(50, 100, size=(5, 4)),
    index=df_index,
    columns=df_columns,
)
print(df)

# Column-wise aggregation (default axis=0): one result per subject.
print(df.sum())   # total score of each subject
print(df.mean())  # mean score of each subject

# Row-wise aggregation (axis=1): one result per student.
print(df.sum(axis=1))   # total score of each student
print(df.mean(axis=1))  # mean score of each student

# Sum over every element of the table.
print("df对象所有行列元素的和为:", end="")
print(df.sum().sum())

# pandas aggregations skip missing values.
# Cast the column to float first: writing NaN into an integer column relies
# on implicit upcasting, which newer pandas versions deprecate.
df["物理"] = df["物理"].astype(float)
df.loc["Alex", "物理"] = np.nan
print(df)
print(df.sum())
After classifying and organizing the aggregation functions, the code is as follows:
import numpy as np
import pandas as pd

# Aggregation functions: a 5-student x 4-subject table of random scores.
df_index = ["Tony", "Alex", "Jack", "Mary", "Kelly"]
df_columns = ["语文", "数学", "英语", "物理"]
df = pd.DataFrame(
    data=np.random.randint(50, 100, size=(5, 4)),
    index=df_index,
    columns=df_columns,
)

# Insert one missing value to see how the aggregations treat it.
# Cast the column to float first: writing NaN into an integer column relies
# on implicit upcasting, which newer pandas versions deprecate.
df["物理"] = df["物理"].astype(float)
df.loc["Jack", "物理"] = np.nan
print("=====初识成绩表(带空值)=====")
print(df)

# ==============================================
# 1. Most common statistics (default axis=0: per column)
# ==============================================
print("\n=====1、每科统计(列方向)=====")
print("每科总分:")
print(df.sum())     # per-subject total, NaN skipped
print("\n每科平均分:")
print(df.mean())    # per-subject mean, NaN skipped
print("\n每科最高分:")
print(df.max())     # per-subject maximum
print("\n每科最低分:")
print(df.min())     # per-subject minimum, NaN skipped
print("\n每科中位数(中间分数):")
print(df.median())  # per-subject median
print("\n每科标准差(分数波动情况):")
print(df.std())     # per-subject standard deviation; larger means less stable
print("\n每科非空人数:")
print(df.count())   # per-subject count of non-missing scores

# ==============================================
# 2. Row-wise aggregation (axis=1: per student)
# ==============================================
print("\n=====2、每个学生统计(行方向)=====")
print("每个学生的总分:")
print(df.sum(axis=1))     # per-student total, NaN skipped
print("\n每个学生的平均分:")
print(df.mean(axis=1))
print("\n每个学生的最高分:")
print(df.max(axis=1))
print("\n每个学生分数中位数(中间分数):")
print(df.median(axis=1))
print("\n每个学生分数标准差(分数波动情况):")
print(df.std(axis=1))     # larger means the student's scores vary more

# ==============================================
# 3. Global aggregation over the whole table
# ==============================================
# Flatten every score into a single Series so each statistic is computed over
# all individual scores. Chaining an aggregation twice (e.g. df.std().std())
# would instead aggregate the per-column results, which is a different number
# for mean/median/std, especially when a column has missing values.
all_scores = pd.Series(df.to_numpy().ravel())
print("\n=====3、全局统计(整个表格)=====")
print("所有学生全科总分:", all_scores.sum())
print("全班所有科目平均分:", all_scores.mean())
print("全班最高分:", all_scores.max())
print("全班最低分:", all_scores.min())
print("全班中位数(中间分数):", all_scores.median())
print("全班标准差(分数波动情况):", all_scores.std())
print("全班非空人数:", all_scores.count())

# =============================================
# 4. Useful extension: idxmax() / idxmin() return the LABEL of the
#    maximum / minimum value, not the value itself.
# =============================================
print("\n=====4、使用扩展=====")
# Labels say "student with the highest/lowest score" because a name is printed.
print("语文最高分的同学是:", df["语文"].idxmax())
print("英语最低分的同学是:", df["英语"].idxmin())
print("Tony同学的最高分科目是:", df.loc["Tony"].idxmax())
Exercises:
# The class has four students (张三, 李四, 王五, 赵六) and three subjects
# (Python, Java, C).
# 1. Let score1 be the midterm scores and score2 the final scores. Create both
#    freely, add them, and compute the midterm/final average.
# 2. 张三 cheated on the midterm Java exam; record it as 0.
# 3. 李四 reported the cheating and gets +10 on every midterm subject.
# 4. One question was faulty, so every student gets +10 on the midterm.
# 5. The Python teacher wants to know who scored better in Java than in Python.
import numpy as np
import pandas as pd

student_list = ["张三", "李四", "王五", "赵六"]
subject_list = ["Python", "Java", "C"]

score1 = pd.DataFrame(
    data=[
        [78, 72, 58],
        [89, 87, 77],
        [87, 91, 76],
        [93, 95, 98],
    ],
    index=student_list,
    columns=subject_list,
)
print("=====初始期中考试成绩表=====")
print(score1)
print("=====初始期中考试各科平均分=====")
print(score1.mean())
print("=====初始期中考试各学生平均分=====")
print(score1.mean(axis=1))

score2 = pd.DataFrame(
    data=[
        [85, 91, 83],
        [81, 72, 66],
        [83, 88, 79],
        [91, 93, 96],
    ],
    index=student_list,
    columns=subject_list,
)
print("=====期末考试成绩表=====")
print(score2)
print("=====期末考试各科平均分=====")
print(score2.mean())
print("=====期末考试各学生平均分=====")
print(score2.mean(axis=1))

print("=====初始期中和期末考试成绩总和表=====")
print(score1 + score2)

print("=====张三期中考试Java作弊,记为0分=====")
score1.loc["张三", "Java"] = 0

print("=====李四举报作弊期中考试各科加分=====")
score1.loc["李四"] += 10
# Cap at the full mark of 100 after the bonus.
score1.loc["李四"] = score1.loc["李四"].clip(upper=100)

print("=====由于有一道题出错,期中考试所有学生各科加分10分=====")
# Cap at the full mark of 100 after the bonus.
score1 = (score1 + 10).clip(upper=100)

print("=====最终期中考试成绩表=====")
print(score1)
print("=====最终期中考试各科平均分=====")
print(score1.mean())
print("=====最终期中考试各学生平均分=====")
print(score1.mean(axis=1))

# The header now says "average" because the value printed is
# (midterm + final) / 2, not the sum.
average_scores = (score1 + score2) / 2
print("=====最终期中和期末考试成绩平均值表=====")
print(average_scores)

print("=====期中考试Java成绩比Python好的同学名单=====")
midterm_java_better = score1["Java"] > score1["Python"]
print(score1[midterm_java_better].index.tolist())

print("=====期末考试Java成绩比Python好的同学名单=====")
final_java_better = score2["Java"] > score2["Python"]
# Easy to misread: obj["column label"] selects a column, but indexing with a
# boolean Series filters ROWS.
print(score2[final_java_better].index.tolist())
# Preferred: use .loc[] to make the row selection explicit.
print(score2.loc[final_java_better].index.tolist())
* Single-level index
import numpy as np
import pandas as pd

df = pd.DataFrame(data=np.random.randint(0, 20, size=(5, 3)), columns=list("ABC"))
print(df)
print(df.index)             # row index
print(df.columns)           # column index
print(df.index.tolist())    # row index as a plain list
print(df.columns.tolist())  # column index as a plain list

# Modifying / setting the row and column indexes.
# Plain approach: assign list-likes directly.
df.index = list("PQRST")
df.columns = ["maths", "english", "physics"]
print(df)

# Customised approach: first build an index object
# (Index / RangeIndex / DatetimeIndex, etc.) ...
index1 = pd.Index(data=["haha", "hehe", "oooo", "lala", "uuuu"], name="SSSS")
index2 = pd.RangeIndex(0, 10, 2, name="numberssss")

# ... then assign it to df.index.
df.index = index1
print(df)
df.index = index2
print(df)
* Multi-level index
# Multi-level (hierarchical) indexes.

# Multi-level column index: every (period, product) combination.
LEVEL1 = ["第一期", "第二期"]
LEVEL2 = ["A", "B", "C"]
columns = pd.MultiIndex.from_product([LEVEL1, LEVEL2], names=["期数", "产品"])
index = ["lucy", "tom", "alex"]
# The row index above could equally be built with pd.Index:
# index = pd.Index(data=["lucy", "tom", "alex"], name="姓名")
data = np.random.randint(0, 100, size=(3, 6))
df = pd.DataFrame(data=data, index=index, columns=columns)
print(df)

# Multi-level row index: every (group, salesperson) combination.
group = ["Group1", "Group2"]
sales = ["Anna", "Lisa", "Mina", "Fiona"]
index = pd.MultiIndex.from_product([group, sales], names=["Groups", "Sales"])
columns = pd.Index(data=["A", "B", "C", "D", "E"], name="product")
data = np.random.randint(0, 100, size=(8, 5))
df = pd.DataFrame(data=data, index=index, columns=columns)
print(df)
Exercises:
Create a DataFrame holding the scores of Lucy, Tom, and Jack in each subject (Python, Java, C) this semester. Find the subject in which Lucy scored highest. Get Tom's scores and calculate his average score across all subjects. Get Jack's Python score and add 20 points to it.
import numpy as np
import pandas as pd

# Random scores for three students in three subjects.
score_df = pd.DataFrame(
    data=np.random.randint(50, 100, size=(3, 3)),
    index=["Lucy", "Tom", "Jack"],
    columns=["Python", "Java", "C"],
)
print("=====本学期成绩表如下:=====")
print(score_df)

# idxmax() on a row returns the column label (subject) of the highest score.
lucy_best = score_df.loc["Lucy"].idxmax()
print("\n=====Lucy最高分科目如下:=====")
print(lucy_best)

print("\n=====Tom各科成绩如下:=====")
print(score_df.loc["Tom"].to_dict())

# Mean of Tom's row, rounded to two decimals.
tom_average = round(score_df.loc["Tom"].mean(), 2)
print("\n=====Tom各科平均分为:=====")
print(tom_average)

print("\n=====Jack的Python成绩为:=====")
print(score_df.loc["Jack", "Python"])

print("\n=====给Jack的Python成绩加上20分:=====")
score_df.loc["Jack", "Python"] += 20
print(score_df)
This content is automatically aggregated by InertiaRSS (RSS Reader) for reading reference only. Original from — Copyright belongs to the original author.