current position:Home>Three basic data quality evaluation methods and python implementation

Three basic data quality evaluation methods and python implementation

2022-05-15 05:51:38Broad starfish sink

Basic data quality evaluation method

1. Table-data saturation (null-rate) analysis.
2. Code-table comparison analysis and threshold-accuracy analysis (consistency between the fact table and the dimension table).
3. Time saturation (data continuity) analysis.
4. Relationship between the subject table and the behavior table (no example yet; design it for the specific scenario).

Null rate of statistical table data

#  Overview: generate a SQL script that reports the per-field saturation (null rate) of a table; it can be run manually or on a schedule.
#  Author: Wang Zhendong
#  Date: 2021-02-07
from odps import ODPS

# MaxCompute (ODPS) client used by check_data_by_execute_sql below.
# 'ak' / 'sk' are AccessKey-id / AccessKey-secret placeholders, and the
# endpoint URL is a placeholder too -- replace all of them before running.
# NOTE(review): 'project_namme' looks like a typo for 'project_name'; it is a
# placeholder anyway, but confirm the real project name when filling it in.
o = ODPS('ak', 'sk', 'project_namme', endpoint='http://xxxxx/api')

def check_data_by_execute_sql(table_name, partition):
    """Print a markdown null-rate report for every commented column of a table.

    Builds one SQL statement that computes, per column, the fraction of rows
    whose value is NULL / '' / 'null' / 'NULL' / '-' / whitespace-only, runs it
    on the ODPS client `o`, prints a markdown table of the rates, and finally
    prints a histogram of rates bucketed to one decimal place (0.0 .. 1.0).

    Parameters:
        table_name: name of the ODPS table to inspect.
        partition: SQL condition selecting the partition, e.g. "pt='20210207'".

    Returns:
        None -- output goes to stdout.
    """
    ta = o.get_table(table_name)
    # Build the per-column expressions in a list and join once, instead of
    # repeated string concatenation (quadratic on wide tables).
    select_items = []
    for col in ta.schema.columns:
        col_name = col.name
        col_comm = col.comment
        # Columns without a usable comment are skipped: the comment is used
        # as the report's display name (backtick-quoted alias below).
        if col_comm in (None, '', 'null'):
            continue
        select_items.append(
            "sum(case when (%s is null) or (%s in ('', 'null', 'NULL', '-')) or " \
            "(trim(%s) = '') then 1 else 0 end)/count(1) as `%s`,\n" %
            (col_name, col_name, col_name, col_comm))
    sql_str = ('select \n'
               + ''.join(select_items)
               + "count(1) as total_cnt \nfrom %s where %s" % (table_name, partition))
    # print(sql_str)
    print('| Field name | Null rate |\n|----|----|\n')
    # Histogram buckets keyed '0.0' .. '1.0'.
    statistic = {'%.1f' % (i / 10): 0 for i in range(11)}
    with o.execute_sql(sql_str).open_reader() as rec:
        rf = rec.to_result_frame()
        names = rf.names
        values = rf.values[0]  # single result row
        # Last column is total_cnt, not a rate -- handled after the loop.
        # NOTE(review): if the partition is empty, count(1) is 0 and the rates
        # come back NULL -- confirm partitions are non-empty before running.
        for field, rate in zip(names[:-1], values[:-1]):
            print("|%s|%.2f|" % (field, rate))
            statistic['%.1f' % rate] += 1
        print(' Total data ', values[-1])
    for bucket, count in statistic.items():
        print(bucket, count)

def main():
    """Run the null-rate report for each configured (table, partition) pair."""
    # Placeholder targets -- replace with real table names and partition filters.
    targets = (
        ('table_name', "partition_name"),
        ('table_name', "partition_name"),
        ('table_name', "partition_name"),
    )
    for table, partition in targets:
        check_data_by_execute_sql(table, partition)

if __name__ == '__main__':
    main()

Threshold-accuracy analysis by comparison against the code (dictionary) table

#  Overview: build the SQL for the code-table comparison check.
#  Author: Wang Zhendong
#  Date: 2021-01-07
"""
 Import the dictionary of a field into the standard dictionary table template , Dictionary table structure :
    create table dim_gjlcb2d0_zdbm_all (
        code string,
        name string)
    comment ' Diagnostic code table ';
    create table dim_gjlcb2d0_ssbm_all (
        code string,
        name string)
    comment ' Operation code table ';
 Statistical content : Code table comparison failed  name/code  Yes , And the corresponding number of data pieces 
"""

"""
var:
    code_field  Code fields 
    name_field  Name field 
    code_ch  Code field comments 
    name_ch  Name field comment 
    table_name  The name of the table 
    dim_table  Dictionary table name 
    partition  Specify the data partition 
"""
def fill_sql(var, detail=False):
    """Fill in the code-table comparison SQL templates and return the SQL text.

    Parameters:
        var: 7-tuple of
            (code_field, name_field, code_ch, name_ch, table_name, dim_table,
             partition) -- the code/name column names, their display comments,
            the source table, the dictionary table, and the partition filter.
        detail: when False (default) return the summary SQL (count of
            non-matching name/code pairs and rows involved); when True return
            the per-pair detail SQL. The original version built the detail
            template but never used it -- this keyword makes it reachable
            without changing existing callers.

    Returns:
        str: the filled-in SQL statement.
    """
    code_field, name_field, code_ch, name_ch, table_name, dim_table, partition = var
    sql_model = """
--  Abnormal Statistics 
select '%s_%s',count(1)  Incomparable name_code logarithm ,sum(cnt)  Number of data involved 
from (select code, name, cnt 
      from (select %s code, %s name,count(1) cnt from %s 
            where %s and %s is not null and %s is not null and %s !='-' and %s != '-' group by %s,%s) t_1  --  Filter out the null value 
      left anti join %s t_2
      on t_1.code = t_2.code and t_1.name = t_2.name) t_3;
"""
    detail_sql_model = """
--  Exception details 
select code %s, name %s, cnt  Data volume 
from (select %s code, %s name,count(1) cnt from %s 
      where %s and %s is not null and %s is not null and %s !='-' and %s != '-' group by %s,%s) t_1  --  Filter out the null value 
left anti join %s t_2
on t_1.code = t_2.code and t_1.name = t_2.name;
"""
    # Both templates consume the same 13 substitutions in the same order.
    args = (code_ch, name_ch, code_field, name_field, table_name, partition,
            code_field, name_field, code_field, name_field, code_field,
            name_field, dim_table)
    template = detail_sql_model if detail else sql_model
    return template % args

def main():
    """Print the filled-in code-table comparison SQL for each config tuple."""
    # Each tuple: (code_field, name_field, code comment, name comment,
    #              source table, dictionary table, partition filter).
    configs = [
        ('c', 'n', ' Diagnostic code ', ' Diagnosis name ', 'xxxxxxxxx',
         'dim_gjlcb2d0_zdbm_all', "partition'"),
    ]
    for cfg in configs:
        print(fill_sql(cfg))

if __name__ == '__main__':
    main()

Time saturation

--  Note: if the table contains dirty data (e.g. a timestamp of 9999-01-01 00:00:00),
--  add a where condition to filter it out before running these statistics.
--  1) Look at the overall time range of the data.
select 
	max(time_field),min(time_field) 
from table_name
where pt = 'partition_name'; 
--  2) Number of days the data spans (day level).
select 
	datediff(max(time_field),min(time_field),'dd')  The total number of days the data lasts 
from table_name
where pt = 'partition_name'
  and time_field between  ' constraint condition ' and ' constraint condition '; 
--  3) Day-level time saturation per organization: organization, days with data,
--     time saturation, total rows.
select org_code  organization , count(1)  Number of days with data , count(1)/ The total number of days the data lasts   Time saturation , sum(cnt)  Total data 
from (
	select 
		org_code, to_char(time_field, 'yyyy-mm-dd') dd, count(1) cnt
	from table_name
	where pt = 'partition_name'
	  and time_field between  ' constraint condition ' and ' constraint condition '
	-- fixed: group by the same day-level expression that is selected; the
	-- original grouped by to_char(time_field, 'yyyy-mm') (month level), which
	-- contradicts the selected 'yyyy-mm-dd' and breaks the day-level counts.
	group by org_code, to_char(time_field, 'yyyy-mm-dd')
) t  -- subquery alias added; required by most SQL dialects
where cnt > 0
group by org_code;

copyright notice
author[Broad starfish sink],Please bring the original link to reprint, thank you.
https://en.pythonmana.com/2022/131/202205110610439334.html

Random recommended