Python Forum

Full Version: Numpy Structure and Efficiency
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I have the following code:

import matplotlib.pyplot as plt
import matplotlib.cbook as cbook

import numpy as np
import math

# Physical constants in CGS units.
SIGMA = 5.67051e-5      # Stefan-Boltzmann constant (erg cm^-2 s^-1 K^-4)
PI    = math.pi         # Pi
M_SUN = 1.989e33        # Solar mass (g)
L_SUN = 3.826e33        # Solar luminosity (erg/s)
R_SUN = 6.9599e10       # Solar radius (cm)
YEAR  = 3.15570e7       # Length of a year (s)


# Stefan-Boltzmann law: L = A * sigma * T^4, hence T = (L / (A * sigma)) ** (1/4).
# Without the fourth root the expression yields T^4 (~1.1e15), not a temperature.
sa_sun = 4 * PI * R_SUN ** 2                    # Solar surface area (cm^2)
t_sun  = (L_SUN / (sa_sun * SIGMA)) ** 0.25     # Solar surface temperature (K), ~5770



# Converts Fortran output (xD+/-y, or x+/-y when the exponent letter is dropped) into float
def convert(oldValue):
    """Convert one Fortran-formatted real number to a Python float.

    Accepted spellings (surrounding whitespace is ignored):
      * ``1.05200D+09`` / ``1.05200d+09`` - 'D' exponent marker
      * ``4.99722-235``                   - marker omitted (three-digit exponent)
      * ``0.0499938`` / ``-1.5``          - plain decimal, no exponent

    Args:
        oldValue: The text of a single numeric field.

    Returns:
        The value as a float.

    Raises:
        ValueError: If the text is not a number in one of the forms above.
    """
    text = oldValue.strip().upper().replace('D', 'E')

    if 'E' not in text:
        # No exponent letter: a sign anywhere after the first character starts
        # the exponent.  A sign at index 0 belongs to the mantissa, and if no
        # such sign exists the field is a plain decimal.
        sign_at = max(text.rfind('+'), text.rfind('-'))
        if sign_at > 0:
            text = text[:sign_at] + 'E' + text[sign_at:]

    # Let float() parse the whole literal so the result is correctly rounded;
    # multiplying mantissa by 10**exponent adds a rounding error.
    return float(text)



# Reads in data from STELCOR file
def readData(datafile):
    """Read the "hydro" records of a STELCOR listing into one flat array.

    Only lines whose characters 1-5 are "hydro" or "HYDRO" are used.  Each
    such line contributes eight values taken from fixed character columns:
    time, delta_t, mass, radius, lum_core, lum_tot, flux and ratio.

    Args:
        datafile: Path of the listing file to read.

    Returns:
        A 1-D float array with the eight values of every record stored back
        to back (8 * number_of_records elements).  It is empty when no line
        matches.  Note that the records are NOT kept as separate rows.
    """
    data = np.array([])
    values = []

    # The context manager closes the file even if a field fails to parse.
    with open(datafile) as file:
        for line in file:
            if line[1:6] in ("hydro", "HYDRO"):
                time      = convert(line[21:35])
                delta_t   = convert(line[36:50])
                mass      = convert(line[51:65])
                radius    = convert(line[66:80])
                lum_core  = convert(line[81:95])
                lum_tot   = convert(line[96:110])
                flux      = convert(line[111:125])
                ratio     = float(line[125:137])
                values.extend([time, delta_t, mass, radius, lum_core, lum_tot, flux, ratio])

    # np.append copies the whole array on every call, so append once at the
    # end instead of once per line.
    data = np.append(data, values, axis = 0)

    return data



if __name__ == "__main__":
    # readData creates and returns the array itself, so stellarData needs no
    # placeholder value beforehand.
    stellarData = readData("dummymain.lst")
    #CM_Graph(?,?)
I am adding the data from a textfile that contains thousands of lines, similar to this:

Output:
hydro output: 1 1.05200D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 4.99722-235 0.0499938
hydro output: 2 2.10400D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 2.88583-105 0.0499938
hydro output: 3 3.15600D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 3.81557D-62 0.0499938
hydro output: 4 4.20800D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 1.19575D-40 0.0499938
hydro output: 5 5.26000D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 8.64733D-28 0.0499938
hydro output: 6 6.31200D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 3.04933D-19 0.0499938
hydro output: 7 7.36400D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 3.72610D-13 0.0499938
hydro output: 8 8.41600D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 1.32716D-08 0.0499938
hydro output: 9 9.46800D+09 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 4.49291D-05 0.0499938
hydro output: 10 1.05200D+10 1.05200D+09 9.94376D+31 3.66754D+10 7.52265D+31 7.52265D+31 2.93595D-02 0.0499938
The system strips out the initial text, ignores the incremental step (integer) value and populates an array with the converted float data.

All the data is added to a Numpy array of 1 dimension.

I originally envisaged a structure such as:

Output:
[[1052000000.0, 1052000000.0, 9.94376e+31, 36675400000.0, 7.52265e+31, 7.52265e+31, 4.9972200000000005e-235, 0.0499938], [...], ...]
However, instead of one row per record, every value ends up as a separate element of a single flat array:

Output:
[1052000000.0, 1052000000.0, 9.94376e+31, 36675400000.0, 7.52265e+31, 7.52265e+31, 4.9972200000000005e-235, 0.0499938]
I later want to reference all rows of data, but only the data in a specific position in that row (all data in column 3, for example).

How might I alter this and is there a more efficient way of collecting/storing the data?
One advantage of posting to a forum is that you then relax; nothing you can do until someone answers.

It is at those moments your brain actually works!

I solved the issue simply by defining data as an empty 2D array. Then I populated it with another Numpy array.

# Reads in data from STELCOR file
def readData(datafile):
    """Read the "hydro" records of a STELCOR listing into a 2-D array.

    Only lines whose characters 1-5 are "hydro" or "HYDRO" are used.  Each
    such line becomes one row whose columns, taken from fixed character
    positions, are: time, delta_t, mass, radius, lum_core, lum_tot, flux
    and ratio.

    Args:
        datafile: Path of the listing file to read.

    Returns:
        A float array of shape (number_of_records, 8); shape (0, 8) when no
        line matches.  Column k of every record is ``result[:, k]``.
    """
    data = np.empty((0, 8), float)
    rows = []

    # The context manager closes the file even if a field fails to parse.
    with open(datafile) as file:
        for line in file:
            if line[1:6] in ("hydro", "HYDRO"):
                time      = convert(line[21:35])
                delta_t   = convert(line[36:50])
                mass      = convert(line[51:65])
                radius    = convert(line[66:80])
                lum_core  = convert(line[81:95])
                lum_tot   = convert(line[96:110])
                flux      = convert(line[111:125])
                ratio     = float(line[125:137])
                rows.append([time, delta_t, mass, radius, lum_core, lum_tot, flux, ratio])

    # np.append copies the whole array on every call; collecting the rows in
    # a list and appending them in one go keeps the read linear in file size.
    if rows:
        data = np.append(data, np.array(rows, dtype = float), axis = 0)

    return data
All done; thank you for reading.
I played with your code. The result is not exactly what you are expecting, but you may find some tricks here that help you.

If you can estimate the size of the array in advance, you can pre-allocate it and fill it in place; this is drastically faster than appending or concatenating one row at a time.



import numpy as np
import re, time

# Way 1: if you cannot estimate the size of the data array
def GetData(Data, Line):
    """Parse one "hydro output" line and stack its eight values under Data.

    For a line shaped like ``hydro output: <step> v1 ... v8`` the first three
    whitespace-separated tokens are skipped and the next eight are kept, in
    this order: time, delta_t, mass, radius, lum_core, lum_tot, flux, ratio.

    Args:
        Data: Array with 8 columns (or 8 elements) to extend.
        Line: One text line; leading/trailing whitespace is ignored.

    Returns:
        A new array: Data with one extra row appended.  The values are
        stacked as text (with 'D' exponents rewritten to 'E'); converting
        them to float is left to the caller.

    Raises:
        IndexError: If the line has fewer than eleven tokens.
    """
    # Fortran marks double-precision exponents with 'D'; float parsing needs 'E'.
    # str.split() without arguments drops leading/trailing whitespace, so a
    # line that starts with a blank does not shift the token positions.
    Variables = Line.replace('D', 'E').split()

    Array = Variables[3:11]
    if len(Array) != 8:
        raise IndexError(
            f"expected 8 values after the first 3 tokens, found {len(Array)}"
        )

    return np.vstack((Data, Array))

# A single line is used here instead of a complete text file
Extract = "hydro output:     1    1.05200D+09    1.05200D+09    9.94376D+31    3.66754D+10    7.52265D+31    7.52265D+31    4.99722D-235   0.0499938"

# An n-line text file is simulated by processing the same line n times
n = 10_000


## Way 1: the array grows by one row per line
# perf_counter is a monotonic, high-resolution clock meant for measuring intervals
t0 = time.perf_counter()

# Start from an empty (0, 8) text array: there is no placeholder row to delete
# afterwards, and the result is still well-formed if no line matches.
Data = np.empty((0, 8), dtype=str)

for _ in range(n):
    if "HYDRO" in Extract.upper():
        Data = GetData(Data, Extract)

t1 = time.perf_counter()

# The array holds text so far; it is converted to float in a single step
# (faster than converting the numbers one by one)
Data = Data.astype(float)

# Remember:
# time      = column 0
# delta_t   = column 1
# mass      = column 2
# radius    = column 3
# lum_core  = column 4
# lum_tot   = column 5
# flux      = column 6
# ratio     = column 7

# If you want all radius data, for example:
Radius = Data[:, 3]
t2 = time.perf_counter()
print(f"Duration reading lines={t1-t0}")
print(f"Duration converting data={t2-t1}")


## Way 2: if you can estimate the size of the data array (can be the max number of lines?)
Data2 = np.empty((n, 8))
filled = 0                      # number of rows actually written
for _ in range(n):
    if "HYDRO" in Extract.upper():
        # Work on a local copy of the tokens; Extract itself stays untouched
        Variables = Extract.replace('D', 'E').split()
        Data2[filled, :] = [float(value) for value in Variables[3:11]]
        filled += 1

# Drop the unused (uninitialised) tail in case some lines did not match
Data2 = Data2[:filled]

flux = Data2[:, 6]
t3 = time.perf_counter()
print(f"Duration way2={t3-t2}")

# Both approaches must produce the same numbers
MaxDifference = np.max(np.absolute(Data - Data2))
print(f"Max difference={MaxDifference}")