Skip to content
Snippets Groups Projects
Commit 98c8398f authored by Izzard, Robert Dr (Maths & Physics)'s avatar Izzard, Robert Dr (Maths & Physics)
Browse files

add script

parent a1160da2
No related branches found
No related tags found
No related merge requests found
# JSON filtertree
A Python script to load and filter a JSON file, showing the nested-key structure with regex filtering.
## Usage:
```
json_filtertree <JSONfile> [...filters...]
```
The first argument should be the JSON filename.
Following arguments are optional, and are one or more of:
+regex : any key in the path must match the regex (like "grep")
-regex : no key in the path can match the regex (like "grep -v")
$regex : the final key in the path must match the regex (note: you may have to
escape the $ in your terminal)
^regex : the first key in the path must match the regex
Requires a few Python modules (e.g. `halo`, `msgpack`, `simplejson`) that, if not
available, are easily installed with pip. Requires Python 3 and initially tested on 3.8 and 3.9.
Developed for use with binary_c-python (https://gitlab.eps.surrey.ac.uk/ri0005/binary_c-python) but probably useful elsewhere.
(c) Robert Izzard 2021
#!/usr/bin/python3
import bz2
import collections
import gzip
import inspect
import json
import msgpack
import os
import re
import simplejson
import sys
from halo import Halo
############################################################
#
# script to show JSON tree-branch information, with filters.
#
# The first argument should be the JSON filename.
#
# Following arguments are optional, and are one or more of:
#
# +regex : any key in the path must match the regex (like "grep")
# -regex : no key in the path can match the regex (like "grep -v")
# $regex : the final key in the path must match the regex (note: you may have to
# escape the $ in your terminal)
# ^regex : the first key in the path must match the regex
#
# Developed for use with binary_c-python but probably useful elsewhere.
# (c) Robert Izzard 2021
#
############################################################
# global output options for the tree printer
options = {
    # if suppress is True (the default), lines which start with the same data
    # are compacted into the single (longer) line, so for example the following
    # two lines:
    #
    # a,b
    # a,b,c
    #
    # would only output
    #
    # a,b,c
    'suppress' : True,
    # output key list separator
    'separator' : ' : ',
    # float list designator : a list of float keys is replaced by this
    'float list designator' : 'floats',
    # int list designator : a list of int keys is replaced by this
    'int list designator' : 'ints',
}
def file_compression(filename):
    """
    Return the compression type of the ensemble file, based on its
    filename extension: "bzip2", "gzip", or None if uncompressed.
    """
    # map filename suffix -> compression type
    suffix_to_compression = (
        (".bz2", "bzip2"),
        (".gz", "gzip"),
    )
    for suffix, compression in suffix_to_compression:
        if filename.endswith(suffix):
            return compression
    return None
def file_type(filename):
    """
    Return the file type of a JSON file: "JSON", "msgpack",
    or None if neither ".json" nor ".msgpack" appears in the filename.
    """
    # substring (not suffix) tests, so e.g. "x.json.bz2" is still "JSON"
    if ".json" in filename:
        return "JSON"
    if ".msgpack" in filename:
        return "msgpack"
    return None
def open_json(filename, encoding='utf-8'):
    """
    Open the file at filename for reading, transparently decompressing
    bzip2/gzip based on the filename extension.

    msgpack files are opened in binary mode ("rb"); everything else in
    text mode ("rt") with the given encoding.

    Returns the open file object.
    """
    compression = file_compression(filename)
    if file_type(filename) == "msgpack":
        flags = "rb"
        # bug fix: encoding= is invalid in binary mode and makes
        # open()/bz2.open()/gzip.open() raise ValueError, so only
        # pass it in text mode
        kwargs = {}
    else:
        flags = "rt"
        kwargs = {"encoding": encoding}
    if compression == "bzip2":
        return bz2.open(filename, flags, **kwargs)
    if compression == "gzip":
        return gzip.open(filename, flags, **kwargs)
    return open(filename, flags, **kwargs)
def keys_to_floats(json_data):
    """
    Recursively convert the keys of a (nested) mapping to floats where
    possible.

    Keys that cannot be converted (e.g. non-numeric strings) are kept
    as-is. Mapping values are converted recursively; list values have
    any mapping items converted recursively; all other values are
    copied unchanged.

    Returns a new mapping of the same type as json_data.
    """
    # type(json_data)() adopts the mapping type (dict, OrderedDict, ...)
    # correctly *and* is fast
    new_data = type(json_data)()
    for k, v in json_data.items():
        # convert key to a float if we can, otherwise leave as-is;
        # bug fix: catch only the conversion errors float() can raise,
        # not a bare except (which also swallowed KeyboardInterrupt etc.)
        try:
            newkey = float(k)
        except (TypeError, ValueError):
            newkey = k
        # act on value(s)
        if isinstance(v, list):
            # list data: recurse into any mapping items
            new_data[newkey] = [
                keys_to_floats(item)
                if isinstance(item, collections.abc.Mapping)
                else item
                for item in v
            ]
        elif isinstance(v, collections.abc.Mapping):
            # dict, ordereddict, etc. data
            new_data[newkey] = keys_to_floats(v)
        else:
            # assume all other data are scalars
            new_data[newkey] = v
    return new_data
def get_size(obj, seen=None):
    """
    Recursively find the total size of obj (and everything it
    contains) in bytes. seen is the set of object ids already counted,
    used to avoid double-counting and infinite recursion.
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        # already counted elsewhere
        return 0
    # important: mark as seen *before* recursing to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    size = sys.getsizeof(obj)
    # count the instance __dict__ when the class defines it as a
    # getset/member descriptor
    if hasattr(obj, '__dict__'):
        for cls in obj.__class__.__mro__:
            if '__dict__' in cls.__dict__:
                descriptor = cls.__dict__['__dict__']
                if (inspect.isgetsetdescriptor(descriptor)
                        or inspect.ismemberdescriptor(descriptor)):
                    size += get_size(obj.__dict__, seen)
                break
    if isinstance(obj, dict):
        # count both keys and values
        for key, value in obj.items():
            size += get_size(value, seen)
            size += get_size(key, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(get_size(item, seen) for item in obj)
    # objects can have __slots__ as well as __dict__
    if hasattr(obj, '__slots__'):
        size += sum(
            get_size(getattr(obj, slot), seen)
            for slot in obj.__slots__
            if hasattr(obj, slot)
        )
    return size
def load_json_data(json_filename):
    """
    Load JSON or msgpack data from json_filename (transparently
    decompressed based on its extension), convert numeric keys to
    floats with keys_to_floats(), report the in-memory size, and
    return the data.

    Raises ValueError if the file type cannot be determined from the
    filename.
    """
    # show a spinner while (potentially slow) loading happens
    with Halo(text="Loading", interval=250, spinner="moon", color="yellow"):
        _loaded = False

        def _hook(obj):
            # object hook: print the "Loaded" message exactly once, on
            # the first decoded object, then pass objects through
            nonlocal _loaded
            if not _loaded:
                _loaded = True
                print(
                    "Loaded {} data from {}".format(filetype, json_filename),
                    flush=True,
                )
            return obj

        filetype = file_type(json_filename)
        if filetype is None:
            # bug fix: previously fell through with 'data' undefined,
            # raising a confusing NameError
            raise ValueError(
                "Unknown file type for {}: expected a JSON or msgpack file".format(
                    json_filename
                )
            )
        # bug fix: close the file object even if decoding fails
        with open_json(json_filename) as file_object:
            if filetype == "JSON":
                data = simplejson.load(file_object, object_hook=_hook)
            else:
                data = msgpack.load(file_object, object_hook=_hook)
        data = keys_to_floats(data)
        print("Data size {:5.2f} MByte".format(get_size(data) / (1024 * 1024)))
        return data
def is_float_list(values):
    """
    Return True if every item of values is a float, otherwise False.

    (An empty iterable returns True. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, float) for x in values)
def is_int_list(values):
    """
    Return True if every item of values is an int, otherwise False.

    (An empty iterable returns True; note bools count as ints, as in
    the original isinstance test. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, int) for x in values)
def is_number_list(values):
    """
    Return True if every item of values is a float or an int,
    otherwise False.

    (An empty iterable returns True. The parameter was renamed from
    'list', which shadowed the builtin; all callers pass positionally.)
    """
    return all(isinstance(x, (float, int)) for x in values)
# module-level state shared between parse() and parse_flush()
prevout = []      # the previously-output pretty path (set to a tuple by parse())
cache = []        # pending output lines, used when options['suppress'] is True
prevoutdict = {}  # pretty paths already output, as tuple keys mapped to True
def parse(data, path=None):
    """
    Parse the data recursively, outputting tree paths when required.

    path carries the state of the current descent: path['raw'] is the
    list of actual keys, path['pretty'] is the displayed form (runs of
    all-float or all-int keys are collapsed to a single designator
    string from options). Output is filtered by the module-level
    regex_compiled dict and deduplicated/compacted via the module-level
    prevout, prevoutdict and cache.
    """
    global prevout
    global cache
    if path is None:
        # start of a new descent
        path = {
            "raw" : [],
            "pretty" : []
        }
    if isinstance(data, list):
        # data == list
        # NOTE(review): the current path is not passed down here, so
        # mappings found inside a list restart from an empty path —
        # confirm this is intended.
        for x in data:
            parse(x)
    elif isinstance(data, collections.abc.Mapping):
        # data == dict
        keys = data.keys()
        # replace the keylist with a designator string if all keys are
        # floats (or all ints)
        if is_float_list(keys):
            path['pretty'].append(options['float list designator'])
            replaced_keys = True
        elif is_int_list(keys):
            path['pretty'].append(options['int list designator'])
            replaced_keys = True
        else:
            replaced_keys = False
        # loop over real keys
        for x in keys:
            # we must match a regex to strings, so convert
            # if required.
            if not isinstance(x, str):
                xstring = str(x)
            else:
                xstring = x
            if regex_compiled['skiplist'] and \
               regex_compiled['skiplist'].search(xstring):
                # if in skiplist, skip (like "grep -v")
                continue
            else:
                # push this key onto the path
                path['raw'].append(xstring)
                if not replaced_keys:
                    path['pretty'].append(xstring)
                # if first in path is not in the startswith list, skip
                # (popping what we just pushed)
                if regex_compiled['startswith'] and \
                   not regex_compiled['startswith'].search(path['pretty'][0]):
                    path['raw'].pop()
                    if not replaced_keys:
                        path['pretty'].pop()
                    continue
                # NOTE(review): path['pretty'] is a list but prevout is
                # assigned a tuple below, so this comparison is always
                # True after the first assignment; the effective
                # duplicate gate is prevoutdict — confirm intended.
                if not path['pretty'] == prevout:
                    t = tuple(path['pretty'])
                    # only output more if we haven't already (shouldn't happen!)
                    if not t in prevoutdict:
                        outstring = options['separator'].join(path['pretty'])
                        # default to the data being ok to output
                        data_ok = True
                        # if we have a greplist, all greplist regexes must
                        # match at least one string in the path
                        for regex in regex_compiled['greplist']:
                            if not any(regex.search(item) for item in path['pretty']):
                                data_ok = False
                        # if we have an endlist, the final item in the
                        # path must match it
                        if regex_compiled['endswith'] and \
                           not regex_compiled['endswith'].search(path['pretty'][-1]):
                            data_ok = False
                        if data_ok:
                            if options['suppress']:
                                # compact: a line extending the cached one
                                # replaces it, otherwise flush and restart
                                if cache and outstring.startswith(cache[-1]):
                                    cache[-1] = outstring
                                else:
                                    parse_flush()
                                    cache = [outstring]
                            else:
                                print(outstring)
                        prevoutdict[t] = True
                    # set prevout so we don't output two identical
                    # lines in a row: note this is a tuple as it's never changed
                    prevout = tuple(path['pretty'])
                # recurse into this key's value, then pop the key
                parse(data[x], path)
                path['raw'].pop()
                if not replaced_keys:
                    path['pretty'].pop()
        # pop the float/int designator pushed before the key loop
        if replaced_keys:
            path['pretty'].pop()
    else:
        # other (assume scalar/string): nothing to descend into
        pass
def parse_flush():
    """
    Print all output lines accumulated in the global cache, then
    reset the cache to empty.
    """
    global cache
    pending, cache = cache, []
    for line in pending:
        print(line)
############################################################
# parse command line args
json_filename = sys.argv[1]  # first argument: the JSON/msgpack filename
greplist = []        # +regex filters: some key in the path must match
skiplist = []        # -regex filters: no key in the path may match
startswithlist = []  # ^regex filters: the first key in the path must match
endswithlist = []    # $regex filters: the final key in the path must match
for s in sys.argv[2:]:
    if s.startswith('--'):
        # options
        # NOTE(review): '--' options are currently accepted but ignored
        pass
    elif s.startswith('-'):
        skiplist.append(s[1:])
    elif s.startswith('+'):
        greplist.append(s[1:])
    elif s.startswith('$'):
        endswithlist.append(s[1:])
    elif s.startswith('^'):
        # re-anchor each regex to the start of the key string
        startswithlist.append('^' + s[1:])
# pre-compile regular expressions: each list is combined into a single
# alternation regex, or None if that filter is unused
regex = {
    'startswith' : ('(?:%s)' % '|'.join(startswithlist)) if startswithlist else None,
    'endswith' : ('(?:%s)' % '|'.join(endswithlist)) if endswithlist else None,
    'skiplist' : ('(?:%s)' % '|'.join(skiplist)) if skiplist else None,
}
regex_compiled = {
    'skiplist' : None,
    'endswith' : None,
    'startswith' : None,
    'greplist' : []
}
# greplist patterns stay separate: each must match somewhere in the path
for grep in greplist:
    regex_compiled['greplist'].append(re.compile(grep))
# compile the combined patterns (skipping unused, i.e. None, filters)
# NOTE(review): r is always a non-empty dict key, so the 'r and' test
# is redundant; regex[r] alone gates compilation
for r in regex:
    if r and regex[r]:
        regex_compiled[r] = re.compile(regex[r])
# load data and parse it
parse(load_json_data(json_filename))
# flush any output still held in the suppress cache
parse_flush()
# done
# done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment