# Source code for nbtoolbelt.nbstatsapp

"""
Main for nbstats

Copyright (c) 2017 - Eindhoven University of Technology, The Netherlands

This software is made available under the terms of the MIT License.

Summarize Jupyter notebooks on the command line.
"""

from argparse import _ArgumentGroup, SUPPRESS
from pathlib import Path
from textwrap import dedent
from typing import Optional, List

from nbformat import NotebookNode

from nbtoolbelt.arguments import NegatableAction
from nbtoolbelt.counting import nb_metadata, nb_other_metadata, nb_extra_fields, nb_cell_stats, extract_aggregate
from nbtoolbelt.printing import print_dict
from nbtoolbelt.processing import ProcessingResultType
from nbtoolbelt.toolbaseapp import Tool

# Short tool name; StatsTool prefixes it with 'nb' to build the name it passes to Tool.__init__.
TOOL = 'stats'


class StatsTool(Tool):
    """Collect and show a statistical summary of each notebook.

    Also compute some statistics over all notebooks.
    """

    # Individual statistics flags that ``--all`` (``all_stats``) switches on together.
    _ALL_STATS_FLAGS = (
        'cell_types',
        'sources',
        'metadata',
        'tags',
        'attachments',
        'outputs',
        'streams',
        'errors',
        'execution',
        'extra',
    )

    def __init__(self) -> None:
        """Set up the tool with its name, description and action."""
        super().__init__(
            name='nb' + TOOL,
            description=dedent("""\
                Summarize Jupyter notebooks on stdout with statistics.
                By default summarizes all notebook data.
                Limit to specific items by passing options.
                """),
            action='summarize'
        )

    def _cell_sections(self, args):
        """Return ``(enabled, key, title)`` for each cell-level statistics section, in print order.

        ``enabled`` is the (truthy/falsy) combination of flags on ``args`` that selects the section,
        ``key`` is the section's key in the cell statistics dictionary,
        and ``title`` is the heading handed to ``print_dict``.
        """
        return [
            (args.cell_types, 'cell_types', "Cell types"),
            (args.sources, 'sources', "Cell sources"),
            (args.metadata or args.tags, 'cell_metadata', "Cell metadata fields"),
            (args.attachments, 'cell_attachments', "Cell attachments"),
            (args.outputs or args.streams or args.errors, 'code_outputs', "Code cell outputs"),
            (args.execution, 'code_execution', 'Code cell execution'),
            (args.extra, 'cell_extra', "EXTRA CELL FIELDS"),
        ]

    def process_nb(self, nb: NotebookNode, nb_path: Path) -> ProcessingResultType:
        """Collect and present statistics about notebook nb.

        The required notebook metadata is always printed; every other section is printed
        only when the corresponding option is set.
        Everything that was printed is also merged into the last entry of
        ``self._aggregate['outputs']``.

        :param nb: notebook to count
        :param nb_path: path to ``nb`` (not used by this method)
        :return: always an empty list
        """
        args = self._args
        output = {}  # to accumulate JSON output

        # always print summary of required notebook metadata
        metadata = nb_metadata(nb)
        print_dict(metadata, 'Notebook metadata')
        output['notebook_metadata'] = metadata

        # print notebook-level optional metadata fields, if desired
        if args.metadata:
            other_metadata = nb_other_metadata(nb)
            print_dict(other_metadata, 'Other notebook metadata fields')
            output['notebook_other_metadata'] = other_metadata

        # print the notebook extra fields, if desired
        if args.extra:
            extra_fields = nb_extra_fields(nb)
            print_dict(extra_fields, 'EXTRA NOTEBOOK FIELDS')
            output['notebook_extra_fields'] = extra_fields

        cell_stats = nb_cell_stats(nb, args)

        # print desired cell statistics, remembering which sections were shown
        keys = []
        for enabled, key, title in self._cell_sections(args):
            if enabled:
                print_dict(cell_stats[key], title)
                keys.append(key)

        # only the sections that were shown go into the collected output
        output.update({key: value for key, value in cell_stats.items() if key in keys})
        self._aggregate['outputs'][-1].update(output)

        return []

    def process_collected_data(self):
        """Print totals over all notebooks, after the base class has processed the collected data.

        Nothing is printed when at most one notebook was given on the command line.
        The totals are read from the first entry of ``self._aggregate['outputs']``
        (NOTE(review): presumably filled in by ``Tool.process_collected_data`` -- confirm there).
        """
        super().process_collected_data()

        # print some aggregation results
        args = self._args
        if len(args.notebooks) <= 1:
            return

        # print header
        print('\nTotals\n======\n')

        aggregate = self._aggregate['outputs'][0]

        # print notebook-level optional metadata fields, if desired
        if args.metadata:
            other_metadata = extract_aggregate(aggregate.get('notebook_other_metadata', {}))
            print_dict(other_metadata, 'Other notebook metadata fields')

        # print the notebook extra fields, if desired
        if args.extra:
            extra_fields = extract_aggregate(aggregate.get('notebook_extra_fields', {}))
            print_dict(extra_fields, 'EXTRA NOTEBOOK FIELDS')

        # print desired cell statistics
        for enabled, key, title in self._cell_sections(args):
            if not enabled:
                continue
            section = aggregate.get(key, {})
            # delete 'maximum In[#]' from collected outputs, because its statistics don't make much sense;
            # this removes it from the aggregate itself, exactly as before
            if key == 'code_execution' and 'maximum In[#]' in section:
                del section['maximum In[#]']
            print_dict(extract_aggregate(section), title)

    def config_tool_args_parsing(self, group: _ArgumentGroup) -> None:
        """Register the statistics-selection options on ``group``.

        Every option uses ``NegatableAction`` with ``default=SUPPRESS``, so argparse adds no
        attribute for an option that was not given; the help text shows the current value
        of the option in ``self._args``.

        :param group: argument group to which the options are added
        """
        def add_flag(*option_strings: str, attr: str, text: str, **extra) -> None:
            # one negatable flag; ``attr`` is the name under which its current value lives in self._args
            group.add_argument(
                *option_strings,
                action=NegatableAction,
                default=SUPPRESS,
                help=text + ' (default: {})'.format(getattr(self._args, attr)),
                **extra
            )

        add_flag('--all', attr='all_stats', text='show all statistics', dest='all_stats')
        add_flag('-c', '--cell-types', attr='cell_types', text='count cell types')
        add_flag('-s', '--sources', attr='sources', text='statistics for cell sources')
        add_flag('-m', '--metadata', attr='metadata', text='show notebook metadata and count cell metadata')
        add_flag('-t', '--tags', attr='tags', text='count individual cell tags')
        add_flag('-a', '--attachments', attr='attachments', text='count cell attachment MIME types')
        add_flag('-o', '--outputs', attr='outputs', text='count code cell outputs')
        add_flag('--streams', attr='streams', text='count code cell output stream names')
        add_flag('-e', '--errors', attr='errors', text='count code cell error names')
        add_flag('-x', '--execution', attr='execution', text='statistics for code execution')
        add_flag('--extra', attr='extra', text='report extra fields outside metadata')

    def check_and_adjust_arguments(self):
        """Turn on every individual statistics option when ``all_stats`` is set."""
        args = self._args

        # if --all, summarize everything
        if args.all_stats:
            for flag in self._ALL_STATS_FLAGS:
                setattr(args, flag, True)
def main(cli_args: Optional[List[str]] = None) -> int:
    """Run the statistics tool as a command-line program.

    :param cli_args: command-line arguments, passed on unchanged to ``StatsTool.main``
    :return: whatever ``StatsTool.main`` returns
    """
    tool = StatsTool()
    return tool.main(cli_args)
# Script entry point: run the tool and hand its result to the shell as the exit status.
if __name__ == "__main__":
    import sys

    exit_status = main()
    sys.exit(exit_status)