N175_标记异常值_自动汇聚某单位的住房公积金有补发有增减如何批量找出来

本文主要是介绍N175_标记异常值_自动汇聚某单位的住房公积金有补发有增减如何批量找出来，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']    #定义使其正常显示中文字体黑体
plt.rcParams['axes.unicode_minus'] = False      #用来正常显示表示负号 
data = pd.read_excel(r'.\N175_标记异常值_数据源.xlsx')
import seaborn as sns
# 使用seaborn获取颜色
cm = sns.light_palette("green", as_cmap=True)
# 色阶实现
data.head(100).style.background_gradient(cmap=cm,subset=["发生额"])
# data

	laiyuan	年	月	日期	交易流水号	单位代码	公积金账号	交易号	客户编号	客户姓名	发生额	余额	摘要代码	摘要	凭证种类	凭证号
0	./数据源/2012\201208.xls	2012	8	2012-08-01 00:00:00	流水号0000001	3267	27234556	缴存	210102198901101814#0	姓名001	9340	39720.410000	1072.000000	单位补缴201208	nan	nan
1	./数据源/2012\201208.xls	2012	8	2012-08-01 00:00:00	流水号0000002	3267	27234557	缴存	43282819810118001X#0	姓名002	990	47718.610000	1072.000000	单位补缴201208	nan	nan
2	./数据源/2012\201208.xls	2012	8	2012-08-01 00:00:00	流水号0000003	3267	27234558	缴存	432801198108011031#0	姓名003	1106	42583.660000	1072.000000	单位补缴201208	nan	nan
3	./数据源/2012\201208.xls	2012	8	2012-08-01 00:00:00	流水号0000004	3267	27234559	缴存	432801198710011099#0	姓名004	748	32592.650000	1072.000000	单位补缴201208	nan	nan
4	./数据源/2012\201208.xls	2012	8	2012-08-01 00:00:00	流水号0000005	3267	27234560

limit_output extension: Maximum message size of 10000 exceeded with 132577 characters

df = data[["laiyuan","日期","客户姓名","发生额"]]
df.head(100).style.background_gradient(cmap=cm,subset=["发生额"])  #主要是靠这些列

	laiyuan	日期	客户姓名	发生额
0	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名001	9340
1	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名002	990
2	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名003	1106
3	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名004	748
4	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名005	1070
5	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名006	1196
6	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名007	826
7	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名008	854
8	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名009	832
9	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名010	1060
10	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名011	798
11	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名012	808
12	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名013	840
13	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名014	790
14	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名015	760
15	./数据源/2012\201208.xls	2012-08-01 00:00:00	姓名016	940
16	./数据源/2012\201208.xls	2012-08-01 00:00:00

limit_output extension: Maximum message size of 10000 exceeded with 43779 characters

# 透视表 交叉统计
tongji01=pd.pivot_table(data, index=['laiyuan','日期'], columns='客户姓名', values='发生额',margins=False, aggfunc=[np.sum])
# 最后，要扁平化MultiIndex列，我们可以只连接元组中的值：
tongji01.columns = ['_'.join(col).replace("sum_","") for col in tongji01.columns.values]
tongji01 = tongji01.reset_index()
tongji01.head(10)

	laiyuan	日期	姓名001	姓名002	姓名003	姓名004	姓名005	姓名006	姓名007	姓名008	...	姓名073	姓名074	姓名075	姓名076	姓名077	姓名078	姓名079	姓名080	姓名081	姓名082
0	./数据源/2012\201208.xls	2012-08-01	9340.0	990.0	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	./数据源/2012\201209.xls	2012-09-01	954.0	990.0	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	./数据源/2012\201210.xls	2012-10-01	934.0	990.0	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	./数据源/2012\201211.xls	2012-11-01	934.0	990.0	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	./数据源/2012\201212.xls	2012-12-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	./数据源/2013\201301.xls	2013-01-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	./数据源/2013\201302.xls	2013-02-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7	./数据源/2013\201303.xls	2013-03-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
8	./数据源/2013\201304.xls	2013-04-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	./数据源/2013\201305.xls	2013-05-01	NaN	NaN	1106.0	748.0	1070.0	1196.0	826.0	854.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

10 rows × 84 columns

# 按列找异常值，并用语言描述出来
import pandas as pd
cun = tongji01.copy()  #将数据块复制，后面发现的异常值进行覆盖
hangzuobiao = tongji01.laiyuan.to_list() # 将来源列转列表
suo = tongji01   #.iloc[:,1:4] #   为了缩小范围，提高实验速度
print(suo.shape)
lieming = suo.columns.to_list() #将列标题转列表
for index,col in enumerate(lieming): # 遍历列名，lie_max = suo[col].value_counts().index[:1].values # 关键点，找到该列频率最高的值gailie_max = lie_max[0] # 取当前值，上面计算出来的是列表for hang in range(suo.shape[0]): # 按行总数遍历 ，if suo[col].dtype in ["int64", "float64"]: #判断是否都是数字danqian = suo.loc[hang,col] # 取出当前值if pd.isnull(danqian) == True or danqian == gailie_max: # 为空跳过，相等跳过continue
#             print("当前值："+str(danqian))
#             print("当列出现次数最多的："+str(gailie_max))yichang = (gailie_max-danqian)/gailie_max
#             print(yichang)if abs(yichang)>0.6: # 差额相除取绝对值，看是否超过6成zhi = "异常"+str(hangzuobiao[hang])[:10]+"_"+str(col)+"_："+str(suo.loc[hang,col])cun.iloc[hang,index] = zhi
cun.to_excel("N175_标记异常值_结果.xlsx")

(107, 84)

from styleframe import StyleFrame
import openpyxl
#可以自我覆盖，但是这样有风险，为了列宽自适应，筛选，冻结窗口
file_path = "N175_标记异常值_结果.xlsx"
df=pd.read_excel(file_path)
col = df.columns.values.tolist()
excel_writer = StyleFrame.ExcelWriter(file_path)
sf = StyleFrame(df)
sf.to_excel(excel_writer=excel_writer, best_fit=col,columns_and_rows_to_freeze='a2', row_to_add_filters=0,
)
excel_writer.save()
# 将句柄关闭，否则是只读格式，打不开
excel_writer.handles = None  #这句代码有用，很久才找到
print("well done")