NIFTY 100 Stocks— Kaggle sample dataset
Most of my demonstrations use data from this dataset.
The Stock Market Data — Nifty 100 Stocks (1 min) dataset from Kaggle is an excellent public dataset for Data Engineering and ML.
Volumetrics:
102 files — 66,141,637,537 bytes — 63,944,373 rows
Oldest Date: 09-Jan-2015
Latest Date: 25-Oct-2022
Average Row Count: 35270 Rows
Columns: 58
Companies: 100
CSV File count: 100
Average file size: 655MB (uncompressed CSV)
Full Extract
My demonstrations utilize a full extract from an Azure Data Lake storage account, which I have imported into my raw container.
Daily Data set Extract
I also use daily extracts for some of the demonstrations. The code below reads a number of files from the current directory and creates a folder for each company with a file for each date in the dataset.
import os
import pandas as pd
from datetime import datetime
import numpy as np
def enumerate_files(folder_name: str) -> list:
    """Return the names of all CSV files directly inside *folder_name*.

    Args:
        folder_name (str): path of the folder to scan (not recursive)

    Returns:
        list: file names (not full paths) ending in '.csv'
    """
    print('--- enumerate_files()')
    all_files = os.listdir(folder_name)
    # endswith() instead of a substring test so names such as
    # 'x.csv.bak' or 'foo.csvx' are not picked up by mistake
    csv_files = [file for file in all_files if file.endswith('.csv')]
    return csv_files
def read_csv(filename: str) -> pd.DataFrame:
    """Read one stock CSV file into a pandas DataFrame.

    The 'date' column is parsed as timezone-aware timestamps
    (format ``%Y-%m-%d %H:%M:%S%z``, e.g. '2022-10-25 09:15:00+05:30').

    Args:
        filename (str): path to a CSV file containing a 'date' column

    Returns:
        pd.DataFrame: the file contents with 'date' parsed and dtypes
        converted via ``convert_dtypes()``
    """
    print('--- read_csv()')
    df = pd.read_csv(filename)
    # Vectorised parse with an explicit format. The previous per-row
    # strptime lambda passed through `date_parser` is deprecated in
    # pandas >= 2.0 and considerably slower.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S%z')
    df = df.convert_dtypes()
    return df
def create_date_column(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the raw 'date' column with 'Timestamp' and 'Date' columns.

    'Timestamp' carries the original per-minute timestamp; 'Date' is the
    calendar day truncated to midnight (dtype datetime64[ns]). Both are
    moved to the front of the frame.

    Args:
        df (pd.DataFrame): original DataFrame with a parsed 'date' column

    Returns:
        pd.DataFrame: new DataFrame with 'Date' and 'Timestamp' first and
        the original 'date' column removed
    """
    print('--- create_date_column()')
    df = df.copy()  # avoid mutating the caller's frame
    df['Timestamp'] = df['date']
    # BUG FIX: the previous `df.drop(['date'], axis=1)` discarded its
    # result, so the 'date' column was never actually removed.
    df = df.drop(columns=['date'])
    df['Date'] = df['Timestamp'].dt.date.astype('datetime64[ns]')
    # move Date and Timestamp columns to the front
    cols = list(df)
    cols.insert(0, cols.pop(cols.index('Date')))
    cols.insert(1, cols.pop(cols.index('Timestamp')))
    df = df.loc[:, cols]
    return df
def get_dates_list(df: pd.DataFrame) -> list:
    """Return the unique calendar dates present in the DataFrame, sorted.

    Args:
        df (pd.DataFrame): a DataFrame with a datetime 'Date' column

    Returns:
        list: sorted list of unique ``datetime.date`` values
    """
    print('---get_dates_list()')
    # unique() preserves order of first appearance, so sort explicitly
    # to honour the documented "sorted" contract.
    dates_list = sorted(df['Date'].dt.date.unique().tolist())
    print(dates_list[:20])  # preview only; large frames have many dates
    return dates_list
def sort_and_index_df(df: pd.DataFrame) -> pd.DataFrame:
    """Sort the DataFrame by its 'Date' column and index it on 'Date'.

    Args:
        df (pd.DataFrame): a DataFrame containing a 'Date' column

    Returns:
        pd.DataFrame: a new DataFrame sorted by date and indexed on
        'Date' (the column becomes the index)
    """
    # Sort by Date so per-day slices come out in chronological order
    df = df.sort_values(by='Date')
    # Index on Date to allow df.loc[<date>] lookups in the driver loop
    df = df.set_index('Date')
    return df
##############################################################
# Driver: split each company's full CSV into one zipped CSV per trading day,
# written to <BASE_PATH>/<company>/<company>_<YYYY-MM-DD>.zip
os.system('clear')

# expanduser() so the '~' actually resolves; a literal '~/...' path
# would make os.listdir() fail.
BASE_PATH = os.path.expanduser('~/Documents/sample_stock')

# Each daily extract is a single-entry ZIP archive containing 'out.csv'.
compression_opts = dict(method='zip',
                        archive_name='out.csv')

csv_list = enumerate_files(BASE_PATH)
for file in csv_list:
    # File names are assumed to start with '<COMPANY>_' — TODO confirm
    company_name = file.split('_')[0]
    print(company_name)
    df = read_csv(os.path.join(BASE_PATH, file))
    df = create_date_column(df)
    print(df.dtypes)
    date_list = get_dates_list(df)
    df = sort_and_index_df(df)

    # Create the per-company output folder; no error if it already exists
    # (replaces the broad try/except around os.mkdir).
    os.makedirs(os.path.join(BASE_PATH, company_name), exist_ok=True)

    for record_date in date_list:
        # NOTE(review): .loc with a single-row date returns a Series, which
        # pd.DataFrame() would transpose — assumes every date has >1 row.
        sub_df = pd.DataFrame(df.loc[np.datetime64(record_date)])
        date = record_date.strftime('%Y-%m-%d')
        csv_name = os.path.join(BASE_PATH, company_name,
                                company_name + '_' + date + '.zip')
        print('Date: '
              , date
              , ' Close mean value: '
              , sub_df.groupby('Date')['close'].mean()
              , ' Target: '
              , csv_name)
        # index=False drops the Date index from the written file; the date
        # is carried in the file name instead.
        sub_df.to_csv(csv_name,
                      header=True,
                      index=False,
                      compression=compression_opts,
                      mode='w'
                      )
The resulting files are named after the company and the date, and are saved as zipped CSV files.