Hi! With your invaluable help, I was able to successfully test the following code using 15 gigabyte (~1400 JSON files) data set.
Please suggest a way to fix it.
# Parse a body (dayly/monthly) of JSONs import os import json import pandas as pd import numpy as np from collections import defaultdict import timeit tic=timeit.default_timer() elements_keys = ['created_at', 'text', 'lang', 'geo'] elements = defaultdict(list) for dirs, subdirs, files in os.walk('/home/Dir'): for file in files: if file.endswith('.json'): with open(file, 'r') as input_file: for line in input_file: try: tweet = json.loads(line) for key in elements_keys: elements[key].append(tweet[key]) except: continue df=pd.DataFrame({'created_at': pd.Index(elements['created_at']), 'Text': pd.Index(elements['text']), 'Lang': pd.Index(elements['lang']), 'Geo': pd.Index(elements['geo'])}) df df.to_csv('month_12_01.csv')Now, I am testing the above code on the EC2 instance (i3.x8large) feeding in 230 gigabyte (~44,000 JSON files). However, about 90 minutes into the run, the following error occurs.
Please suggest a way to fix it.
Error:[code]---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-d4376f526220> in <module>()
27 'text': pd.Index(elements['text']),
28 'lang': pd.Index(elements['lang']),
---> 29 'geo': pd.Index(elements['geo'])})
30 df
31 df.to_csv('month_12_01.csv')
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
264 dtype=dtype, copy=copy)
265 elif isinstance(data, dict):
--> 266 mgr = self._init_dict(data, index, columns, dtype=dtype)
267 elif isinstance(data, ma.MaskedArray):
268 import numpy.ma.mrecords as mrecords
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
400 arrays = [data[k] for k in keys]
401
--> 402 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
403
404 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5396 # figure out the index, if necessary
5397 if index is None:
-> 5398 index = extract_index(arrays)
5399 else:
5400 index = _ensure_index(index)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in extract_index(data)
5444 lengths = list(set(raw_lengths))
5445 if len(lengths) > 1:
-> 5446 raise ValueError('arrays must all be same length')
5447
5448 if have_dicts:
ValueError: arrays must all be same length[/code]