diff --git a/README.md b/README.md index 7b71a70d1924aaa29ed1578945bffdb3dc77eb14..ef6fb8de26ba15bb7f2ed8a7cac30766cadfe7f5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,30 @@ # JSON filtertree -A Python script to load and filter a JSON file, showing the nested-key structure with regex filtering. \ No newline at end of file +A Python script to load and filter a JSON file, showing the nested-key structure with regex filtering. + +## Usage: + +``` + json_filtertree <JSONfile> [...filters...] +``` + + +The first argument should be the JSON filename. + +Following arguments are optional, and are one or more of: + ++regex : any key in the path must match the regex (like "grep") +-regex : no key in the path can match the regex (like "grep -v") +$regex : the final key in the path must match the regex (note: you may have to + escape the $ in your terminal) +^regex : the first key in the path must match the regex + + + +Requires various third-party Python modules that, if not available, are easily +installed with pip. Requires Python 3 and initially tested on 3.8 and 3.9. + + + +Developed for use with binary_c-python (https://gitlab.eps.surrey.ac.uk/ri0005/binary_c-python) but probably useful elsewhere. +(c) Robert Izzard 2021 diff --git a/json_filtertree b/json_filtertree new file mode 100755 index 0000000000000000000000000000000000000000..5a4caf0b91a60237c93094639075fa0ab353b668 --- /dev/null +++ b/json_filtertree @@ -0,0 +1,377 @@ +#!/usr/bin/python3 + +import bz2 +import collections +import gzip +import inspect +import json +import msgpack +import os +import re +import simplejson +import sys +from halo import Halo + +############################################################ +# +# script to show JSON tree-branch information, with filters. +# +# The first argument should be the JSON filename.
#
# Following arguments are optional, and are one or more of:
#
# +regex : any key in the path must match the regex (like "grep")
# -regex : no key in the path can match the regex (like "grep -v")
# $regex : the final key in the path must match the regex (note: you may have to
#          escape the $ in your terminal)
# ^regex : the first key in the path must match the regex
#
# Developed for use with binary_c-python but probably useful elsewhere.
# (c) Robert Izzard 2021
#
############################################################

options = {
    # if suppress is True (the default), lines which start with the same data
    # are compacted into the single (longer) line, so for example the following
    # two lines:
    #
    # a,b
    # a,b,c
    #
    # would only output
    #
    # a,b,c
    'suppress' : True,

    # output key list separator
    'separator' : ' : ',

    # float list designator : a list of float keys is replaced by this
    'float list designator' : 'floats',

    # int list designator : a list of int keys is replaced by this
    'int list designator' : 'ints',
}


def file_compression(filename):
    """
    Return the compression type of the ensemble file, based on its filename
    extension: "bzip2", "gzip" or None (uncompressed).
    """
    if filename.endswith(".bz2"):
        return "bzip2"
    if filename.endswith(".gz"):
        return "gzip"
    return None


def file_type(filename):
    """
    Return the file type of a data file: "JSON" if ".json" appears in the
    filename, "msgpack" if ".msgpack" does, otherwise None.
    """
    if ".json" in filename:
        return "JSON"
    if ".msgpack" in filename:
        return "msgpack"
    return None


def open_json(filename, encoding='utf-8'):
    """
    Open the data file at filename for reading, transparently decompressing
    bzip2/gzip, choosing text or binary mode from the file type.

    msgpack files are binary: they must be opened without an encoding,
    because open/bz2.open/gzip.open raise ValueError if an encoding is
    given together with a binary mode.
    """
    compression = file_compression(filename)
    if file_type(filename) == "msgpack":
        flags = "rb"
        encoding = None  # binary mode: encoding is not allowed
    else:
        flags = "rt"
    if compression == "bzip2":
        return bz2.open(filename, flags, encoding=encoding)
    if compression == "gzip":
        return gzip.open(filename, flags, encoding=encoding)
    return open(filename, flags, encoding=encoding)


def keys_to_floats(json_data):
    """
    Recursively convert the keys of a (nested) mapping to floats where
    possible, leaving unconvertible keys unchanged. Lists are descended
    into; scalar values are copied as-is.

    Constructing type(json_data)() adopts the mapping type (dict,
    OrderedDict, ...) correctly *and* is fast, avoiding a pointless
    copy-then-clear of the original.
    """
    new_data = type(json_data)()

    for key, value in json_data.items():
        # convert key to a float if we can, otherwise leave it as-is
        try:
            newkey = float(key)
        except (ValueError, TypeError):
            newkey = key

        # act on value(s)
        if isinstance(value, list):
            # list data: convert any mappings found inside the list
            new_data[newkey] = [
                keys_to_floats(item)
                if isinstance(item, collections.abc.Mapping)
                else item
                for item in value
            ]
        elif isinstance(value, collections.abc.Mapping):
            # dict, OrderedDict, etc.: recurse
            new_data[newkey] = keys_to_floats(value)
        else:
            # assume all other data are scalars
            new_data[newkey] = value

    return new_data


def get_size(obj, seen=None):
    """
    Recursively find the size of obj in bytes, following instance
    attributes, mapping items, slots and iterables, and counting each
    object only once (seen is the set of already-counted object ids).
    """
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully
    # handle self-referential objects
    seen.add(obj_id)
    if hasattr(obj, '__dict__'):
        for cls in obj.__class__.__mro__:
            if '__dict__' in cls.__dict__:
                d = cls.__dict__['__dict__']
                if inspect.isgetsetdescriptor(d) or inspect.ismemberdescriptor(d):
                    size += get_size(obj.__dict__, seen)
                break
    if isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(get_size(i, seen) for i in obj)

    if hasattr(obj, '__slots__'):  # can have __slots__ with __dict__
        size += sum(get_size(getattr(obj, s), seen)
                    for s in obj.__slots__ if hasattr(obj, s))

    return size


def load_json_data(json_filename):
    """
    Load JSON or msgpack data from json_filename (decompressing if
    required), convert numeric keys to floats, report the in-memory size
    and return the data.

    Raises ValueError if the file type cannot be determined from the
    filename.
    """
    with Halo(text="Loading", interval=250, spinner="moon", color="yellow"):
        _loaded = False

        def _hook(obj):
            # called on the first decoded object: report success exactly once
            nonlocal _loaded
            if not _loaded:
                _loaded = True
                print(
                    "Loaded {} data from {}".format(filetype, json_filename),
                    flush=True,
                )
            return obj

        file_object = open_json(json_filename)
        filetype = file_type(json_filename)
        if filetype == "JSON":
            data = simplejson.load(file_object, object_hook=_hook)
        elif filetype == "msgpack":
            data = msgpack.load(file_object, object_hook=_hook)
        else:
            # previously this fell through and raised NameError on "data"
            raise ValueError(
                "Unknown file type for {}: expected JSON or msgpack".format(
                    json_filename))

        data = keys_to_floats(data)
        print("Data size {:5.2f} MByte".format(get_size(data) / (1024 * 1024)))
    return data


def is_float_list(values):
    """Return True if values contains only floats, otherwise False."""
    return all(isinstance(x, float) for x in values)


def is_int_list(values):
    """Return True if values contains only ints, otherwise False."""
    return all(isinstance(x, int) for x in values)


def is_number_list(values):
    """Return True if values contains only floats or ints, otherwise False."""
    return all(isinstance(x, (float, int)) for x in values)


# state shared between calls of parse():
prevout = ()      # the previously output path, as a tuple
cache = []        # pending output lines (used when options['suppress'])
prevoutdict = {}  # paths already output, keyed by tuple


def parse(data, path=None):
    """
    Parse the data recursively, outputting key-path trees filtered by the
    compiled regular expressions in regex_compiled.

    path holds the current key path: "raw" is the true key list, "pretty"
    is the displayed list in which a dict whose keys are all floats/ints
    is represented by the corresponding designator string.
    """
    global prevout
    global cache
    if path is None:
        path = {
            "raw": [],
            "pretty": [],
        }

    if isinstance(data, list):
        # descend into list elements, keeping the current path
        # (previously the path was dropped here, so dicts nested inside
        # lists restarted at the tree root)
        for item in data:
            parse(item, path)

    elif isinstance(data, collections.abc.Mapping):
        keys = data.keys()

        # replace the key list with a designator string if required
        if is_float_list(keys):
            path['pretty'].append(options['float list designator'])
            replaced_keys = True
        elif is_int_list(keys):
            path['pretty'].append(options['int list designator'])
            replaced_keys = True
        else:
            replaced_keys = False

        # loop over real keys
        for key in keys:

            # we must match regexes against strings, so convert if required
            keystring = key if isinstance(key, str) else str(key)

            if regex_compiled['skiplist'] and \
               regex_compiled['skiplist'].search(keystring):
                # matched the skiplist: prune this subtree
                continue

            path['raw'].append(keystring)
            if not replaced_keys:
                path['pretty'].append(keystring)

            # if the first key in the path fails the startswith filter, skip
            if regex_compiled['startswith'] and \
               not regex_compiled['startswith'].search(path['pretty'][0]):
                path['raw'].pop()
                if not replaced_keys:
                    path['pretty'].pop()
                continue

            t = tuple(path['pretty'])
            # prevout is a tuple, so compare tuples: the original compared
            # a list with a tuple, which is never equal (dead check)
            if t != prevout:

                # only output more if we haven't already (shouldn't happen!)
                if t not in prevoutdict:
                    outstring = options['separator'].join(path['pretty'])

                    # default to the data being ok to output
                    data_ok = True

                    # if we have a greplist, every grep regex must match at
                    # least one string in the path
                    for regex in regex_compiled['greplist']:
                        if not any(regex.search(item)
                                   for item in path['pretty']):
                            data_ok = False

                    # if we have an endswith list, the final item in the
                    # path must match it
                    if regex_compiled['endswith'] and \
                       not regex_compiled['endswith'].search(path['pretty'][-1]):
                        data_ok = False

                    if data_ok:
                        if options['suppress']:
                            if cache and outstring.startswith(cache[-1]):
                                # extend the previous (shorter) line
                                cache[-1] = outstring
                            else:
                                parse_flush()
                                cache = [outstring]
                        else:
                            print(outstring)

                    prevoutdict[t] = True

                # set prevout so we don't output two identical lines in a
                # row: note this is a tuple as it's never changed
                prevout = t

            parse(data[key], path)
            path['raw'].pop()
            if not replaced_keys:
                path['pretty'].pop()

        if replaced_keys:
            path['pretty'].pop()
    else:
        # other data (assume scalar/string): nothing to descend into
        pass


def parse_flush():
    """Print and empty the cache of pending output lines."""
    global cache
    for line in cache:
        print(line)
    cache = []


############################################################
# command-line driver

# compiled filter regexes, filled in by main(); the defaults apply no
# filters, so parse() is usable on its own (e.g. when imported)
regex_compiled = {
    'skiplist': None,
    'endswith': None,
    'startswith': None,
    'greplist': [],
}


def main():
    """
    Parse the command-line arguments (filename then filters), compile the
    filter regexes, then load the file and output its filtered key tree.
    """
    global regex_compiled

    if len(sys.argv) < 2:
        # previously a missing filename raised a bare IndexError
        print("Usage: json_filtertree <JSONfile> [...filters...]")
        sys.exit(1)

    json_filename = sys.argv[1]
    greplist = []
    skiplist = []
    startswithlist = []
    endswithlist = []
    for arg in sys.argv[2:]:
        if arg.startswith('--'):
            # (long) options: none defined yet
            pass
        elif arg.startswith('-'):
            skiplist.append(arg[1:])
        elif arg.startswith('+'):
            greplist.append(arg[1:])
        elif arg.startswith('$'):
            endswithlist.append(arg[1:])
        elif arg.startswith('^'):
            # anchor at the start of the key
            startswithlist.append('^' + arg[1:])

    # combine each filter list into a single alternation and pre-compile
    regex = {
        'startswith': ('(?:%s)' % '|'.join(startswithlist)) if startswithlist else None,
        'endswith': ('(?:%s)' % '|'.join(endswithlist)) if endswithlist else None,
        'skiplist': ('(?:%s)' % '|'.join(skiplist)) if skiplist else None,
    }
    regex_compiled['greplist'] = [re.compile(grep) for grep in greplist]
    for key, pattern in regex.items():
        if pattern:
            regex_compiled[key] = re.compile(pattern)

    # load data and parse it
    parse(load_json_data(json_filename))
    parse_flush()


if __name__ == "__main__":
    main()

# done