Skip to content
Snippets Groups Projects
Commit 98c8398f authored by Izzard, Robert Dr (Maths & Physics)'s avatar Izzard, Robert Dr (Maths & Physics)
Browse files

add script

parent a1160da2
No related branches found
No related tags found
No related merge requests found
# JSON filtertree
A Python script to load and filter a JSON file, showing the nested-key structure with regex filtering.
## Usage:
```
json_filtertree <JSONfile> [...filters...]
```
The first argument should be the JSON filename.
Following arguments are optional, and are one or more of:
+regex : any key in the path must match the regex (like "grep")
-regex : no key in the path can match the regex (like "grep -v")
$regex : the final key in the path must match the regex (note: you may have to
escape the $ in your terminal)
^regex : the first key in the path must match the regex
Requires a few Python modules (e.g. `halo`, `msgpack`, `simplejson`) that, if not
available, are easily installed with pip. Requires Python 3 and initially tested on 3.8 and 3.9.
Developed for use with binary_c-python (https://gitlab.eps.surrey.ac.uk/ri0005/binary_c-python) but probably useful elsewhere.
(c) Robert Izzard 2021
#!/usr/bin/python3
import bz2
import collections
import gzip
import inspect
import json
import msgpack
import os
import re
import simplejson
import sys
from halo import Halo
############################################################
#
# script to show JSON tree-branch information, with filters.
#
# The first argument should be the JSON filename.
#
# Following arguments are optional, and are one or more of:
#
# +regex : any key in the path must match the regex (like "grep")
# -regex : no key in the path can match the regex (like "grep -v")
# $regex : the final key in the path must match the regex (note: you may have to
# escape the $ in your terminal)
# ^regex : the first key in the path must match the regex
#
# Developed for use with binary_c-python but probably useful elsewhere.
# (c) Robert Izzard 2021
#
############################################################
# global output options for the tree printer
options = {
    # if suppress is True (the default), lines which start with the same data
    # are compacted into the single (longer) line, so for example the following
    # two lines:
    #
    # a,b
    # a,b,c
    #
    # would only output
    #
    # a,b,c
    'suppress' : True,
    # output key list separator
    'separator' : ' : ',
    # float list designator : a list of float keys is replaced by this
    'float list designator' : 'floats',
    # int list designator : a list of int keys is replaced by this
    'int list designator' : 'ints',
}
def file_compression(filename):
    """
    Return the compression type of the ensemble file, based on its
    filename extension: "bzip2", "gzip", or None if uncompressed.
    """
    # map filename suffix -> compression type
    suffix_to_compression = (
        (".bz2", "bzip2"),
        (".gz", "gzip"),
    )
    for suffix, compression in suffix_to_compression:
        if filename.endswith(suffix):
            return compression
    return None
def file_type(filename):
    """
    Return the file type of a JSON file: "JSON", "msgpack",
    or None if neither ".json" nor ".msgpack" appears in the filename.
    """
    # substring (not suffix) tests, so e.g. "x.json.bz2" is still "JSON"
    if ".json" in filename:
        return "JSON"
    if ".msgpack" in filename:
        return "msgpack"
    return None
def open_json(filename, encoding='utf-8'):
    """
    Open the file at filename for reading, transparently decompressing
    bzip2/gzip based on the filename extension.

    msgpack files are opened in binary mode ("rb"); everything else in
    text mode ("rt") with the given encoding.

    Returns the open file object.
    """
    compression = file_compression(filename)
    if file_type(filename) == "msgpack":
        flags = "rb"
        # bug fix: encoding= is invalid in binary mode and makes
        # open()/bz2.open()/gzip.open() raise ValueError, so only
        # pass it in text mode
        kwargs = {}
    else:
        flags = "rt"
        kwargs = {"encoding": encoding}
    if compression == "bzip2":
        return bz2.open(filename, flags, **kwargs)
    if compression == "gzip":
        return gzip.open(filename, flags, **kwargs)
    return open(filename, flags, **kwargs)
def keys_to_floats(json_data):
    """
    Recursively convert the keys of a (nested) mapping to floats where
    possible.

    Keys that cannot be converted (e.g. non-numeric strings) are kept
    as-is. Mapping values are converted recursively; list values have
    any mapping items converted recursively; all other values are
    copied unchanged.

    Returns a new mapping of the same type as json_data.
    """
    # type(json_data)() adopts the mapping type (dict, OrderedDict, ...)
    # correctly *and* is fast
    new_data = type(json_data)()
    for k, v in json_data.items():
        # convert key to a float if we can, otherwise leave as-is;
        # bug fix: catch only the conversion errors float() can raise,
        # not a bare except (which also swallowed KeyboardInterrupt etc.)
        try:
            newkey = float(k)
        except (TypeError, ValueError):
            newkey = k
        # act on value(s)
        if isinstance(v, list):
            # list data: recurse into any mapping items
            new_data[newkey] = [
                keys_to_floats(item)
                if isinstance(item, collections.abc.Mapping)
                else item
                for item in v
            ]
        elif isinstance(v, collections.abc.Mapping):
            # dict, ordereddict, etc. data
            new_data[newkey] = keys_to_floats(v)
        else:
            # assume all other data are scalars
            new_data[newkey] = v
    return new_data
def get_size(obj, seen=None):
    """
    Recursively find the total size of obj (and everything it
    contains) in bytes. seen is the set of object ids already counted,
    used to avoid double-counting and infinite recursion.
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        # already counted elsewhere
        return 0
    # important: mark as seen *before* recursing to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    size = sys.getsizeof(obj)
    # count the instance __dict__ when the class defines it as a
    # getset/member descriptor
    if hasattr(obj, '__dict__'):
        for cls in obj.__class__.__mro__:
            if '__dict__' in cls.__dict__:
                descriptor = cls.__dict__['__dict__']
                if (inspect.isgetsetdescriptor(descriptor)
                        or inspect.ismemberdescriptor(descriptor)):
                    size += get_size(obj.__dict__, seen)
                break
    if isinstance(obj, dict):
        # count both keys and values
        for key, value in obj.items():
            size += get_size(value, seen)
            size += get_size(key, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(get_size(item, seen) for item in obj)
    # objects can have __slots__ as well as __dict__
    if hasattr(obj, '__slots__'):
        size += sum(
            get_size(getattr(obj, slot), seen)
            for slot in obj.__slots__
            if hasattr(obj, slot)
        )
    return size
def load_json_data(json_filename):
    """
    Load JSON or msgpack data from json_filename (transparently
    decompressed based on its extension), convert numeric keys to
    floats with keys_to_floats(), report the in-memory size, and
    return the data.

    Raises ValueError if the file type cannot be determined from the
    filename.
    """
    # show a spinner while (potentially slow) loading happens
    with Halo(text="Loading", interval=250, spinner="moon", color="yellow"):
        _loaded = False

        def _hook(obj):
            # object hook: print the "Loaded" message exactly once, on
            # the first decoded object, then pass objects through
            nonlocal _loaded
            if not _loaded:
                _loaded = True
                print(
                    "Loaded {} data from {}".format(filetype, json_filename),
                    flush=True,
                )
            return obj

        filetype = file_type(json_filename)
        if filetype is None:
            # bug fix: previously fell through with 'data' undefined,
            # raising a confusing NameError
            raise ValueError(
                "Unknown file type for {}: expected a JSON or msgpack file".format(
                    json_filename
                )
            )
        # bug fix: close the file object even if decoding fails
        with open_json(json_filename) as file_object:
            if filetype == "JSON":
                data = simplejson.load(file_object, object_hook=_hook)
            else:
                data = msgpack.load(file_object, object_hook=_hook)
        data = keys_to_floats(data)
        print("Data size {:5.2f} MByte".format(get_size(data) / (1024 * 1024)))
        return data
def is_float_list(values):
    """
    Return True if every item of values is a float, otherwise False.

    (An empty iterable returns True. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, float) for x in values)
def is_int_list(values):
    """
    Return True if every item of values is an int, otherwise False.

    (An empty iterable returns True; note bools count as ints, as in
    the original isinstance test. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, int) for x in values)
def is_number_list(values):
    """
    Return True if every item of values is a float or an int,
    otherwise False.

    (An empty iterable returns True. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, (float, int)) for x in values)
# module-level state shared between parse() and parse_flush()
prevout = []      # the previously-output pretty path (set to a tuple by parse())
cache = []        # pending output lines, used when options['suppress'] is True
prevoutdict = {}  # pretty paths already output, as tuple keys mapped to True
def parse(data, path=None):
    """
    Parse the data recursively, outputting tree paths when required.

    path carries the state of the current descent: path['raw'] is the
    list of actual keys, path['pretty'] is the displayed form (runs of
    all-float or all-int keys are collapsed to a single designator
    string from options). Output is filtered by the module-level
    regex_compiled dict and deduplicated/compacted via the module-level
    prevout, prevoutdict and cache.
    """
    global prevout
    global cache
    if path is None:
        # start of a new descent
        path = {
            "raw" : [],
            "pretty" : []
        }
    if isinstance(data, list):
        # data == list
        # NOTE(review): the current path is not passed down here, so
        # mappings found inside a list restart from an empty path —
        # confirm this is intended.
        for x in data:
            parse(x)
    elif isinstance(data, collections.abc.Mapping):
        # data == dict
        keys = data.keys()
        # replace the keylist with a designator string if all keys are
        # floats (or all ints)
        if is_float_list(keys):
            path['pretty'].append(options['float list designator'])
            replaced_keys = True
        elif is_int_list(keys):
            path['pretty'].append(options['int list designator'])
            replaced_keys = True
        else:
            replaced_keys = False
        # loop over real keys
        for x in keys:
            # we must match a regex to strings, so convert
            # if required.
            if not isinstance(x, str):
                xstring = str(x)
            else:
                xstring = x
            if regex_compiled['skiplist'] and \
               regex_compiled['skiplist'].search(xstring):
                # if in skiplist, skip (like "grep -v")
                continue
            else:
                # push this key onto the path
                path['raw'].append(xstring)
                if not replaced_keys:
                    path['pretty'].append(xstring)
                # if first in path is not in the startswith list, skip
                # (popping what we just pushed)
                if regex_compiled['startswith'] and \
                   not regex_compiled['startswith'].search(path['pretty'][0]):
                    path['raw'].pop()
                    if not replaced_keys:
                        path['pretty'].pop()
                    continue
                # NOTE(review): path['pretty'] is a list but prevout is
                # assigned a tuple below, so this comparison is always
                # True after the first assignment; the effective
                # duplicate gate is prevoutdict — confirm intended.
                if not path['pretty'] == prevout:
                    t = tuple(path['pretty'])
                    # only output more if we haven't already (shouldn't happen!)
                    if not t in prevoutdict:
                        outstring = options['separator'].join(path['pretty'])
                        # default to the data being ok to output
                        data_ok = True
                        # if we have a greplist, all greplist regexes must
                        # match at least one string in the path
                        for regex in regex_compiled['greplist']:
                            if not any(regex.search(item) for item in path['pretty']):
                                data_ok = False
                        # if we have an endlist, the final item in the
                        # path must match it
                        if regex_compiled['endswith'] and \
                           not regex_compiled['endswith'].search(path['pretty'][-1]):
                            data_ok = False
                        if data_ok:
                            if options['suppress']:
                                # compact: a line extending the cached one
                                # replaces it, otherwise flush and restart
                                if cache and outstring.startswith(cache[-1]):
                                    cache[-1] = outstring
                                else:
                                    parse_flush()
                                    cache = [outstring]
                            else:
                                print(outstring)
                        prevoutdict[t] = True
                    # set prevout so we don't output two identical
                    # lines in a row: note this is a tuple as it's never changed
                    prevout = tuple(path['pretty'])
                # recurse into this key's value, then pop the key
                parse(data[x], path)
                path['raw'].pop()
                if not replaced_keys:
                    path['pretty'].pop()
        # pop the float/int designator pushed before the key loop
        if replaced_keys:
            path['pretty'].pop()
    else:
        # other (assume scalar/string): nothing to descend into
        pass
def parse_flush():
    """
    Print all output lines accumulated in the global cache, then
    reset the cache to empty.
    """
    global cache
    pending, cache = cache, []
    for line in pending:
        print(line)
############################################################
# parse command line args
json_filename = sys.argv[1]  # first argument: the JSON/msgpack filename
greplist = []        # +regex filters: some key in the path must match
skiplist = []        # -regex filters: no key in the path may match
startswithlist = []  # ^regex filters: the first key in the path must match
endswithlist = []    # $regex filters: the final key in the path must match
for s in sys.argv[2:]:
    if s.startswith('--'):
        # options
        # NOTE(review): '--' options are currently accepted but ignored
        pass
    elif s.startswith('-'):
        skiplist.append(s[1:])
    elif s.startswith('+'):
        greplist.append(s[1:])
    elif s.startswith('$'):
        endswithlist.append(s[1:])
    elif s.startswith('^'):
        # re-anchor each regex to the start of the key string
        startswithlist.append('^' + s[1:])
# pre-compile regular expressions: each list is combined into a single
# alternation regex, or None if that filter is unused
regex = {
    'startswith' : ('(?:%s)' % '|'.join(startswithlist)) if startswithlist else None,
    'endswith' : ('(?:%s)' % '|'.join(endswithlist)) if endswithlist else None,
    'skiplist' : ('(?:%s)' % '|'.join(skiplist)) if skiplist else None,
}
regex_compiled = {
    'skiplist' : None,
    'endswith' : None,
    'startswith' : None,
    'greplist' : []
}
# greplist patterns stay separate: each must match somewhere in the path
for grep in greplist:
    regex_compiled['greplist'].append(re.compile(grep))
# compile the combined patterns (skipping unused, i.e. None, filters)
# NOTE(review): r is always a non-empty dict key, so the 'r and' test
# is redundant; regex[r] alone gates compilation
for r in regex:
    if r and regex[r]:
        regex_compiled[r] = re.compile(regex[r])
# load data and parse it
parse(load_json_data(json_filename))
# flush any output still held in the suppress cache
parse_flush()
# done
# done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment