import arcpy, traceback, sys, os
from dateutil.parser import parse
arcpy.env.overwriteOutput = True
def add_fields(Input_Layer, field_name_list):
    '''Quick, hard-coded way to add fields to a table.
    The example below shows one way to manage field names for easier coding:
    create the field name list in your main code, then paste it here for reference.
# NAMES FOR FIELDS TO ADD
# 0 1 2 3 4 5 6 7 8
    tableheaders = [w_name, d_type, tab_name, col_name, col_type, col_width, col_prec, col_alias, col_join]  # 9 BASE COLS (0-8)
    doCounts = True
    tableheaders.append(col_count) # 9 (COUNT BLOCK ADDS 8 COLS: 9-16)
tableheaders.append(col_pc) # 10
tableheaders.append(contains_zeros) # 11
tableheaders.append(max_len) # 12
tableheaders.append(min_val) # 13
tableheaders.append(max_val) # 14
tableheaders.append(min_date_col) # 15
tableheaders.append(max_date_col) # 16
doDomains = True
    tableheaders.append(dom_flag) # 17 (DOMAIN BLOCK ADDS 4 COLS: 17-20)
tableheaders.append(col_domain) # 18
tableheaders.append(dom_col_type) # 19
tableheaders.append(dom_typ_col) # 20
Create table syntax: CreateTable_management (out_path, out_name, {template}, {config_keyword})
'''
# CREATE VALUE TABLE FOR FIELDS TO ADD
# ADDROW FORMAT: Name Type Length - Length is optional
vtab = arcpy.ValueTable(3)
vtab.addRow(field_name_list[0] + " text 255")
vtab.addRow(field_name_list[1] + " text 15")
vtab.addRow(field_name_list[2] + " text 100")
vtab.addRow(field_name_list[3] + " text 100")
vtab.addRow(field_name_list[4] + " text 15")
vtab.addRow(field_name_list[5] + " Double")
vtab.addRow(field_name_list[6] + " Double")
vtab.addRow(field_name_list[7] + " text 50")
vtab.addRow(field_name_list[8] + " text 255")
if len(field_name_list) in (17, 21):
vtab.addRow(field_name_list[9] + " Double")
vtab.addRow(field_name_list[10] + " Double")
vtab.addRow(field_name_list[11] + " text 5")
vtab.addRow(field_name_list[12] + " Double")
vtab.addRow(field_name_list[13] + " Double")
vtab.addRow(field_name_list[14] + " Double")
vtab.addRow(field_name_list[15] + " date")
vtab.addRow(field_name_list[16] + " date")
if len(field_name_list) == 21:
vtab.addRow(field_name_list[17] + " text 5")
vtab.addRow(field_name_list[18] + " text 100")
vtab.addRow(field_name_list[19] + " text 15")
vtab.addRow(field_name_list[20] + " text 15")
if len(field_name_list) == 13:
vtab.addRow(field_name_list[9] + " text 5")
vtab.addRow(field_name_list[10] + " text 100")
vtab.addRow(field_name_list[11] + " text 15")
vtab.addRow(field_name_list[12] + " text 15")
    # LOOP THROUGH THE VALUE TABLE, ADDING FIELDS
    for x in range(vtab.rowCount):
        if vtab.getValue(x, 1) == "text":
            arcpy.AddField_management(Input_Layer, vtab.getValue(x, 0), vtab.getValue(x, 1), field_length=int(vtab.getValue(x, 2)))
        else:
            arcpy.AddField_management(Input_Layer, vtab.getValue(x, 0), vtab.getValue(x, 1))
        arcpy.AddMessage(" Added field " + vtab.getValue(x, 0) + ".")
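# USAGE SKETCH FOR add_fields (NAMES BELOW ARE HYPOTHETICAL): PASS THE TABLE AND A
# LIST OF 9, 13, 17, OR 21 FIELD NAMES SO THE ROW BLOCKS ABOVE LINE UP, E.G.:
#   demo_headers = ["workspace", "data_type", "table_name", "column_name",
#                   "col_type", "col_width", "col_prec", "col_alias", "join_field"]
#   add_fields(demo_table, demo_headers)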
def dict_domainname_domaincolumntype(input_wksp):
    # CREATE A DICTIONARY FROM THE WORKSPACE DOMAINS; THE DOMAIN NAME IS THE KEY
    # AND THE VALUE IS A LIST OF [FIELD TYPE, DOMAIN TYPE].
    # RETURNS NONE IF THE WORKSPACE TYPE IS FILE SYSTEM (FOLDERS HAVE NO DOMAINS).
desc = arcpy.Describe(input_wksp)
wksp_type = desc.workspaceType
if wksp_type == 'FileSystem':
d = None
else:
d = {}
doms = arcpy.da.ListDomains(input_wksp)
for dom in doms:
d[dom.name] = [dom.type, dom.domainType]
return d
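# SKETCH OF THE RETURNED DICTIONARY SHAPE (DOMAIN NAMES BELOW ARE HYPOTHETICAL);
# EACH VALUE IS [FIELD TYPE, DOMAIN TYPE] AS REPORTED BY arcpy.da.ListDomains:
#   {"PipeMaterial": ["Text", "CodedValue"], "PressureRange": ["Double", "Range"]}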
def insert_rows(in_layer, field_list, rows_to_insert):
    # INSERTS A PYTHON LIST OF LISTS; EACH INNER LIST IS ONE ROW OF DATA.
    # PARAMETERS: LAYER TO INSERT ROWS INTO, LIST OF FIELD NAME STRINGS, DATA ROWS.
    # THE with BLOCK RELEASES THE CURSOR EVEN IF AN INSERT FAILS.
    with arcpy.da.InsertCursor(in_layer, field_list) as cursor:
        for row in rows_to_insert:
            cursor.insertRow(row)
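# USAGE SKETCH (VALUES ARE HYPOTHETICAL): FIELD ORDER MUST MATCH THE ROW ORDER.
#   insert_rows(out_table, ["table_name", "column_name"],
#               [["wells", "api_no"], ["wells", "status"]])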
def replace_non_alphanum_char_with_other(in_string, replace_string):
# REPLACES NON-ALPHANUMERIC CHARACTERS WITH A REPLACE STRING.
# FOR EXAMPLE, YOU COULD REPLACE ALL THE NON-ALPHANUMERIC CHARACTERS WITH A SINGLE UNDERSCORE.
# ex: change 1001 - Deep Fork Gathering to 1001_Deep_Fork_Gathering
s = in_string
for c in ("!", "@", "^", "+", "=", "|", ";", "<", ">", "~", "`", " ", "-", "(", ")", "#", "?", "%", "&", "*", "{", "}", "[", "]", ".", ":", "\\", "/"):
if c in s:
s = s.replace(c, replace_string)
s = s.replace(replace_string + replace_string, replace_string).replace(replace_string + replace_string, replace_string).replace(replace_string + replace_string, replace_string).replace(replace_string + replace_string, replace_string).replace(replace_string + replace_string, replace_string)
return s
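# EXAMPLE: replace_non_alphanum_char_with_other("1001 - Deep Fork Gathering", "_")
# RETURNS "1001_Deep_Fork_Gathering"; THE " - " RUN COLLAPSES TO A SINGLE "_".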
def string_is_date(string):
    # DETERMINES IF A STRING COULD BE A DATE (USES dateutil.parser.parse, IMPORTED ABOVE).
    try:
        parse(string)
        return True
    except (ValueError, OverflowError, TypeError):
        return False
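# EXAMPLES: string_is_date("2020-01-31") -> True; string_is_date("N/A") -> False.
# NOTE THAT dateutil IS PERMISSIVE: A BARE NUMBER LIKE "5" ALSO PARSES AS A DATE.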
def string_is_number(s):
# CHECKS TO SEE IF STRING IS NUMERIC. BOOLEAN.
try:
float(s)
return True
except ValueError:
return False
def string_is_long_or_double(s):
''' RETURNS "LONG" OR "DOUBLE" DEPENDING ON NUMERIC DATA TYPE THAT STRING CONVERTS TO.
ASSUMES STRING HAS BEEN VERIFIED AS NUMERIC.'''
if round(float(s), 0)//float(s) != 1:
return "Double"
else:
return "Long"
def get_distinct_values(in_layer, in_col):
    # RETURN THE SET OF DISTINCT VALUES FROM THE SPECIFIED COLUMN.
    values = [row[0] for row in arcpy.da.SearchCursor(in_layer, in_col)]
    return set(values)
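# USAGE SKETCH (NAMES ARE HYPOTHETICAL): get_distinct_values(wells_layer, "status")
# MIGHT RETURN {"Active", "Plugged", None}.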
def table_schema_to_file(tab_path, doDomains, doCounts, dom_dict):
    # COLLECTS TABLE SCHEMA AND COMPILES THE ROWS TO INSERT INTO THE OUTPUT TABLE.
    try:
        # FIRST, DESCRIBE THE TABLE AND COLLECT ITS FIELDS. ALSO SET INITIAL
        # VARIABLES, INCLUDING THE LIST OF VALUES TREATED AS NULL AND A FLAG
        # FOR WHETHER THE TABLE IS AN EXCEL SHEET.
rows_to_add = []
lfields=arcpy.ListFields(tab_path)
d = arcpy.Describe(tab_path)
t = d.baseName
db = d.path
is_excel = False
        if str(db).lower().endswith((".xls", ".xlsx")):
            t = t.strip("'")  # EXCEL SHEET NAMES COME BACK QUOTED; STRIP THE QUOTES.
            is_excel = True
dtype = d.dataType
nullvalues = ["", " ", None]
totalCount = 0
# ITERATE THROUGH TABLE FIELDS.
for i, lf in enumerate(lfields):
field_type = str(lf.type)
# IF COLUMN HEADER HAS INVALID CHARACTERS, REPLACE WITH UNDERSCORES.
# KEEP ORIGINAL COLUMN NAME AND MAKE THE VALID NAME THE ALIAS NAME.
excel_col_alias = lf.aliasName
            for c in ("!", "@", "^", "+", "=", "|", ";", "<", ">", "~", "`", " ", "-", "(", ")", "#", "?", "%", "&", "*", "{", "}", "[", "]", ".", ":", "/", "\\"):
                if c in lf.name:
                    # ONE CALL TO THE HELPER REPLACES ALL INVALID CHARACTERS, SO STOP AT THE FIRST HIT.
                    excel_col_alias = replace_non_alphanum_char_with_other(lf.name, "_")
                    break
# START THE INSERT ROW WITH THE DATA COLLECTED SO FAR.
this_line = []
this_line.append(str(db))
this_line.append(str(dtype))
this_line.append(str(t))
this_line.append(str(lf.name))
this_line.append(field_type)
this_line.append(float(lf.length))
this_line.append(float(lf.precision))
this_line.append(excel_col_alias)
this_line.append(t.lower() + "_" + lf.name.lower())
if (doCounts == True): # IF DOING COLUMN COUNT STATS...
# SET INITIAL VARIABLES
rowCount = 0
hasZeros = False
minNum = 0
maxNum = 0
colLen = 0
pcPop = 0
maxDate = None
minDate = None
is_double = True
is_long = True
column_exists = False
first_string = None
is_date = True
is_guid = True
# ITERATE THROUGH EACH ROW OF THE INPUT TABLE.
with arcpy.da.SearchCursor(tab_path, "*") as s_cur:
for r in s_cur:
v = r[i] # REMEMBER WE ARE ALSO STILL ITERATING THROUGH
# EACH FIELD. SO, WE ARE COLLECTING THE VALUE
# FROM THE FIELD AT HAND.
# IF VALUE IS NOT NULL, MODIFY VARS AS APPROPRIATE.
if not(v in nullvalues):
column_exists = True
rowCount += 1
if lf.type in ("Integer", "Double", "OID"):
if v == 0:
hasZeros = True
if minNum == 0 or v < minNum:
minNum = v
if maxNum == 0 or v > maxNum:
maxNum = v
if lf.type == "String":
if v == "0":
hasZeros = True
if colLen == 0 or len(v) > colLen:
colLen = len(v)
if lf.type == "Date":
if maxDate is None or v > maxDate:
maxDate = v
if minDate is None or v < minDate:
minDate = v
# THE EXCEL SECTION IS SO WE CAN SKIP EMPTY SHEETS,
# AS WELL AS IDENTIFY COLUMN TYPES MORE SPECIFICALLY
# THAN ARCGIS DOES OUT OF THE BOX.
if is_excel == True:
if lf.type == "String":
if v == "0":
hasZeros = True
if string_is_number(v):
if minNum == 0 or float(v) < minNum:
minNum = float(v)
if maxNum == 0 or float(v) > maxNum:
maxNum = float(v)
the_type = string_is_long_or_double(str(v))
if the_type == "Double":
is_long = False
is_date = False
is_guid = False
if the_type == "Long":
is_date = False
is_guid = False
else:
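                                        # ARCGIS NAMES HEADERLESS EXCEL COLUMNS F1, F2, ...;
                                        # TREAT THE FIRST STRING VALUE IN SUCH A COLUMN AS THE
                                        # REAL HEADER, USE IT FOR THE ALIAS, AND DO NOT COUNT
                                        # THE HEADER CELL AS DATA.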
if lf.name[0:1] == "F" and first_string is None and lf.name[1:2].isdigit() == True:
first_string = str(v)
excel_col_alias = replace_non_alphanum_char_with_other(str(v), "_")
rowCount -= 1
else:
is_double = False
is_long = False
if v[0:1] == "{" and v[-1:] == "}":
pass
else:
is_guid = False
if string_is_date(v) == False:
is_date = False
if string_is_date(v) == True:
if maxDate is None or v > maxDate:
maxDate = v
if minDate is None or v < minDate:
minDate = v
elif lf.type == "Double":
num_type = string_is_long_or_double(str(v))
if num_type == "Double":
is_long = False
is_date = False
is_guid = False
elif lf.type == "Date":
is_long = False
is_double = False
is_guid = False
else:
pass
else:
is_long = False
is_double = False
is_date = False
is_guid = False
if lf.type == "OID":
totalCount = rowCount
if is_excel == True:
# USE THE HIGHEST COLUMN COUNT AS THE RECORD COUNT.
# THIS WILL NOT BE EXACT IF THERE ARE TITLES/TOTALS.
if rowCount > totalCount:
totalCount = rowCount
# APPEND THE COUNT STATS TO THE INSERT ROW.
this_line.append(float(rowCount))
this_line.append(float(pcPop))
this_line.append(str(hasZeros))
this_line.append(float(colLen))
this_line.append(float(minNum))
this_line.append(float(maxNum))
this_line.append(minDate)
this_line.append(maxDate)
# UPDATE THE EXCEL VALUES TO BE MORE SPECIFIC.
# HAVE TO WAIT UNTIL THE END BECAUSE NEED TO ITERATE ENTIRE COLUMN.
if is_excel == True:
if is_double == True:
this_line[4] = "Double"
this_line[5] = None
this_line[6] = None
this_line[12] = None
if is_long == True:
this_line[4] = "Long"
this_line[5] = None
this_line[6] = None
this_line[12] = None
if is_date == True:
this_line[4] = "Date"
this_line[5] = None
this_line[6] = None
this_line[12] = None
if is_guid == True:
this_line[4] = "GUID"
this_line[5] = None
this_line[6] = None
this_line[12] = None
if rowCount == 0:
this_line[4] = "String"
this_line[5] = None
this_line[6] = None
this_line[12] = None
if excel_col_alias is not None:
this_line[7] = excel_col_alias
if (doDomains == True): # IF DOING DOMAIN DATA...
sDomain = ""
sFlag = ""
sType = ""
sDType = ""
# IF THIS FIELD IS ASSOCIATED WITH A DOMAIN AND THE DOMAIN
# EXISTS IN THE PASSED DICTIONARY, APPEND THE INFO TO THE
# INSERT ROW.
if not(dom_dict is None):
if not(lf.domain is None):
sDomain = lf.domain
sFlag = "No"
if sDomain in dom_dict:
sType = dom_dict[sDomain][0]
sDType = dom_dict[sDomain][1]
sFlag = "Yes"
this_line.append(sFlag)
this_line.append(str(sDomain))
this_line.append(str(sType))
this_line.append(str(sDType))
if column_exists == True:
rows_to_add.append(this_line)
        # IF THE TABLE IS POPULATED, UPDATE THE PERCENT-POPULATED COLUMN (INDEX 10)
        # FOR EACH FIELD FROM THE FIELD'S OWN COUNT (INDEX 9) AND THE FINAL RECORD COUNT.
        # USE 2 DECIMAL PLACES TO SUPPORT LARGE TABLES WITH ONLY A FEW RECS POPULATED.
        if totalCount > 0:
            for row in rows_to_add:
                row[10] = round((float(row[9]) / float(totalCount)) * 100, 2)
        return rows_to_add
    except Exception:
        tb = sys.exc_info()[2]
        tbinfo = traceback.format_tb(tb)[0]
        pymsg = tbinfo + "\n" + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1])
        arcpy.AddError("Python Messages: " + pymsg + " GP Messages: " + arcpy.GetMessages(2))
if __name__ == "__main__":
in_workspace = arcpy.GetParameterAsText(0) # WORKSPACE
incl_domains = arcpy.GetParameterAsText(1) # BOOLEAN, DEFAULT True
incl_counts = arcpy.GetParameterAsText(2) # BOOLEAN, DEFAULT True
prefix_chosen = arcpy.GetParameterAsText(3) # STRING, DEFAULT Other
out_workspace = arcpy.GetParameterAsText(4) # WORKSPACE
out_table_name = arcpy.GetParameterAsText(5) # STRING
# CREATE COLUMN NAMES
if prefix_chosen == "For ETL Source":
n_prefix = "source_"
s_prefix = "src_"
n_suffix = ""
elif prefix_chosen == "For ETL Target":
n_prefix = "target_"
s_prefix = "targ_"
n_suffix = ""
else:
n_prefix = ""
s_prefix = ""
n_suffix = "_name"
w_name = n_prefix + "workspace"
d_type = n_prefix + "data_type"
tab_name = n_prefix + "table" + n_suffix
col_name = n_prefix + "column" + n_suffix
col_type = s_prefix + "col_type"
col_width = s_prefix + "col_width"
col_prec = s_prefix + "col_prec"
col_alias = s_prefix + "col_alias"
col_count = s_prefix + "col_cnt"
col_pc = s_prefix + "col_pc_pop"
contains_zeros = s_prefix + "col_has_zeros"
max_len = s_prefix + "col_len"
min_val = s_prefix + "col_min"
max_val = s_prefix + "col_max"
min_date_col = s_prefix + "date_min"
max_date_col = s_prefix + "date_max"
dom_flag = s_prefix + "domain_flag"
col_domain = n_prefix + "domain_name"
dom_col_type = s_prefix + "domain_col_type"
col_join = s_prefix + "join_field"
dom_typ_col = s_prefix + "domain_type"
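    # EXAMPLES OF THE RESULTING NAMES: WITH "For ETL Source", w_name IS
    # "source_workspace" AND col_type IS "src_col_type"; WITH THE DEFAULT PREFIX,
    # tab_name IS "table_name" AND col_name IS "column_name".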
try:
arcpy.AddMessage(" ")
# GET OBJECTS PRESENT IN THE INPUT WORKSPACE
arcpy.env.workspace = in_workspace
# COLLECT FEATURE CLASSES FROM WORKSPACE TREE
FCs = []
walk = arcpy.da.Walk(in_workspace, datatype='FeatureClass')
for dirpath, dirnames, filenames in walk:
for filename in filenames:
FCs.append(os.path.join(dirpath, filename))
# COLLECT TABLES FROM WORKSPACE TREE
tables = []
walk = arcpy.da.Walk(in_workspace, datatype='Table')
for dirpath, dirnames, filenames in walk:
for filename in filenames:
tables.append(os.path.join(dirpath, filename))
        # CREATE OUTPUT TABLE
        out_table = arcpy.CreateTable_management(out_workspace, out_table_name)
# CREATE THE COLUMN HEADINGS LIST
# 0 1 2 3 4 5 6 7 8
tableheaders = [w_name, d_type, tab_name, col_name, col_type, col_width, col_prec, col_alias, col_join]
# IF USER SPECIFIED INCLUDING COLUMN COUNT STATISTICS, ADD THOSE COLUMN HEADINGS
if (str(incl_counts).lower() == "true"):
doCounts = True
tableheaders.append(col_count) # 9
tableheaders.append(col_pc) # 10
tableheaders.append(contains_zeros) # 11
tableheaders.append(max_len) # 12
tableheaders.append(min_val) # 13
tableheaders.append(max_val) # 14
tableheaders.append(min_date_col) # 15
tableheaders.append(max_date_col) # 16
else:
doCounts = False
# IF USER SPECIFIED INCLUDING DOMAIN STATISTICS, ADD THOSE COLUMN HEADINGS
if (str(incl_domains).lower() == "true"):
doDomains = True
tableheaders.append(dom_flag) # 17
tableheaders.append(col_domain) # 18
tableheaders.append(dom_col_type) # 19
tableheaders.append(dom_typ_col) # 20
else:
doDomains = False
# ADD THE COLUMN HEADINGS TO THE OUTPUT TABLE
add_fields(out_table, tableheaders)
# GET THE DOMAIN INFO FROM THE GEODATABASE
# (IF THIS IS NOT A GEODATABASE, THIS FUNCTION WILL RETURN NONE)
if doDomains == True:
dom_dict = dict_domainname_domaincolumntype(in_workspace)
        else:
            dom_dict = None  # USE None TO MATCH THE CHECK IN table_schema_to_file.
arcpy.AddMessage(" Writing workspace schema to file...please wait...")
# LOOP THROUGH FEATURE CLASSES, WRITING SCHEMA TO FILE
for x in FCs:
try:
arcpy.env.workspace = in_workspace
desc = arcpy.Describe(x)
x_path = desc.catalogPath
rows1 = table_schema_to_file(x_path, doDomains, doCounts, dom_dict)
arcpy.AddMessage(" Inserting " + str(len(rows1)) + " rows for FC " + str(x) + "...")
insert_rows(out_table, tableheaders, rows1)
del rows1
            except Exception:
                tb = sys.exc_info()[2]
                tbinfo = traceback.format_tb(tb)[0]
                pymsg = tbinfo + "\n" + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1])
                arcpy.AddError("Python Messages: " + pymsg + " GP Messages: " + arcpy.GetMessages(2))
continue
# LOOP THROUGH TABLES, WRITING SCHEMA TO FILE
for x in tables:
try:
rows2 = []
arcpy.env.workspace = in_workspace
desc = arcpy.Describe(x)
x_path = desc.catalogPath
rows2 = table_schema_to_file(x_path, doDomains, doCounts, dom_dict)
arcpy.AddMessage(" Inserting " + str(len(rows2)) + " rows for Table " + str(x) + " ...")
insert_rows(out_table, tableheaders, rows2)
del rows2
            except Exception:
                arcpy.AddWarning("ERROR: " + str(x))
                arcpy.AddWarning("ERROR: " + str(rows2))
                tb = sys.exc_info()[2]
                tbinfo = traceback.format_tb(tb)[0]
                pymsg = tbinfo + "\n" + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1])
                arcpy.AddError("Python Messages: " + pymsg + " GP Messages: " + arcpy.GetMessages(2))
continue
# OUTPUT PARAMETER
arcpy.SetParameter(6, out_table) # TABLEVIEW
arcpy.AddMessage(" ")
    except Exception:
        tb = sys.exc_info()[2]
        tbinfo = traceback.format_tb(tb)[0]
        pymsg = tbinfo + "\n" + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1])
        arcpy.AddError("Python Messages: " + pymsg + " GP Messages: " + arcpy.GetMessages(2))
    finally:
        # CLEAN UP. SOME NAMES ARE ONLY BOUND IF THE TRY BLOCK GOT FAR ENOUGH,
        # SO USE pop() TO AVOID A NameError HERE MASKING THE REAL ERROR.
        del in_workspace, incl_domains, incl_counts, out_workspace, out_table_name, prefix_chosen
        for _name in ("out_table", "tableheaders", "dom_dict", "tables", "FCs"):
            globals().pop(_name, None)