cheatSheet

nice Excel-like functions in Python

http://pbpython.com/excel-pandas-comp.html
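
For instance, Excel's SUMIF and VLOOKUP have direct pandas counterparts; a minimal sketch on made-up sample data:

In [ ]:
import pandas as pd

# illustrative data, not from the notebook
sales = pd.DataFrame({'region': ['east', 'west', 'east'],
                      'amount': [100, 250, 50]})

# SUMIF-style: total amount per region
print(sales.groupby('region')['amount'].sum())

# VLOOKUP-style: pull columns in from a lookup table with merge
lookup = pd.DataFrame({'region': ['east', 'west'], 'manager': ['Ann', 'Bob']})
print(sales.merge(lookup, on='region', how='left'))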

The following code works

import numpy as np
import pandas as pd

def calcindex3a(d, D, maxTTR, eventStart=2, alpha=.001):
    sp = np.exp(-1 * alpha * maxTTR)

    # ratio d/D at each row's recovery time (column index taken from maxTTR)
    DRecoveryRatio = np.empty([len(D), 1])
    for row in d.index:
        if maxTTR.iloc[row][0] == 0:
            DRecoveryRatio[row] = 1
        else:
            DRecoveryRatio[row] = d.iloc[row][maxTTR.iloc[row][0]] / D.iloc[row][maxTTR.iloc[row][0]]
    # k = d[eventStart] / D[eventStart]  # dead assignment in the original; overwritten below
    k = pd.DataFrame(DRecoveryRatio, index=list(range(len(DRecoveryRatio))))
    ind_a = sp * k * DRecoveryRatio
    return ind_a[0]
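
A toy invocation; the shapes here are my assumption (d and D as row-wise time series, maxTTR holding one recovery-time column index per row):

In [ ]:
# hypothetical toy inputs, only to show the expected shapes
d = pd.DataFrame([[0.5, 0.8, 0.9], [0.4, 0.6, 1.0]])
D = pd.DataFrame([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
maxTTR = pd.DataFrame([[2], [0]])  # column index of the recovery time per row
print(calcindex3a(d, D, maxTTR))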

 

However, if we change DRecoveryRatio to a list or a dictionary the result will be different:

with a dictionary the row order is not preserved (dicts are unordered before Python 3.7), so ind_a won't be calculated correctly
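
If a dict is used anyway, reindexing by the frame's row order restores the alignment; a minimal sketch:

In [ ]:
# hypothetical: ratios collected in a dict keyed by row number, in arbitrary order
dratio = {1: 0.6, 0: 0.9}
k = pd.Series(dratio).reindex(range(len(dratio)))  # back in row order 0, 1, ...
print(k)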

 

In [2]:
list('ABe333CD')
Out[2]:
['A', 'B', 'e', '3', '3', '3', 'C', 'D']
In [1]:
# work with directory and file
In [6]:
pathDir = os.path.join("D:\\Dropbox","Python")
if not os.path.isdir(pathDir):
    pathDir = os.path.join("C:\\Users\\Manic Bird\\Dropbox","Python")
os.chdir(pathDir)
In [8]:
os.chdir(os.path.join("D:\\Dropbox","Python","notebooks"))
In [ ]:
# count the number of NaNs per column in a pandas DataFrame
df.isnull().sum()
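
Chaining another sum gives the grand total over the whole frame:

In [ ]:
df.isnull().sum().sum()  # total NaN count across all columns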
 

play with dates

In [ ]:
import dateutil.parser
import datetime
import time
import os

last_date = df.iloc[-1].name
next_unix = last_date.timestamp()
# next_unix  dateutil.parser.parse(last_date)#convert string date col to unix date col
#next_unix = next_unix + datetime.timedelta(1)
type(next_unix)
one_day = 86400  # seconds per day
for i in forecast_set:
    next_unix += one_day
    next_date = datetime.datetime.fromtimestamp(next_unix)
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]
df[forecast_col].plot()
df['forecast'].plot()
plt.show()
 

lists

In [ ]:
change = [1, 'pennies', 2, 'dimes', 3, 'quarters']
# also we can go through mixed lists too
# notice we have to use %r since we don't know what's in it
for i in change:
    print("I got %r" % i)
In [ ]:
for x in range(10):
    print(x, end='')
print("")
import numpy
my_array = numpy.array([[2, 5], 
                        [3, 7],
                        [1, 3],
                        [4, 0]])
m = numpy.min(my_array,axis=0)
print(m)
import pandas as pd
df = pd.DataFrame(my_array)
print(df)
In [ ]:
dimension = [int(i) for i in input().split()]

arr = []
for i in range(dimension[0]):
    arr.append([int(x) for x in input().split()])

maxMin = numpy.max(numpy.min(arr, axis=1))
print("max min is: %s" % maxMin)
In [ ]:
# the same max-of-row-minima, now in pure Python
N, M = [int(i) for i in input().split()]
arr = []
for i in range(N):
    arr.append([int(x) for x in input().split()])
ma = None  # starting from arr[0][0] would overstate the answer when it isn't row 0's minimum
for i in range(N):
    mi = arr[i][0]
    for j in range(1, M):
        if arr[i][j] < mi:
            mi = arr[i][j]
    if ma is None or mi > ma:
        ma = mi
print(ma)
In [ ]:
import numpy as np
n = input()
A = np.array([input().split() for _ in range(int(n))], int)
B = np.array([input().split() for _ in range(int(n))], int)
print(np.dot(A,B))
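
Since Python 3.5 the @ operator is the same matrix product:

In [ ]:
print(A @ B)  # equivalent to np.dot(A, B)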
 

Practicing some SQL with Python

 

First we create a database or connect to one

In [ ]:
import os
import pandas as pd
import sqlite3
dbFullPath = "D:\\Dropbox\\Python\\machineLearning.sqlite"
if not os.path.isfile(dbFullPath):
    dbFullPath = "C:\\Users\\Manic Bird\\Dropbox\\Python\\machineLearning.sqlite"
print(dbFullPath)
In [ ]:
conn = sqlite3.connect(dbFullPath)
table = pd.read_sql_query("SELECT name from sqlite_master", conn)
table.head()
In [ ]:
covData = pd.read_sql_query("Select * from covers WHERE Cover_Type <3 ", conn)

print(covData.head(2))
print(covData[covData['Cover_Type']==1].head(2))
In [ ]:
conn1 = sqlite3.connect(dbFullPath)
c = conn1.cursor()

c.execute("SELECT name from sqlite_master where type = 'table'")

print(c.fetchall())

c.execute("SELECT name FROM sqlite_master")
print(c.fetchall())
c.execute("SELECT * FROM covers")
fieldnames=[f[0] for f in c.description]
print(fieldnames)
In [ ]:
sqlFile = "D:\\Dropbox\\Python\\machineLearning.sqlite"
import sqlite3
conn = sqlite3.connect(sqlFile)
c = conn.cursor()
import pandas as pd
df = pd.read_sql_query("select * from covers where Cover_Type < 3", conn)
df[df['Cover_Type']==3].head()  # empty: the query already excluded Cover_Type >= 3
 

the column named 'as' cannot be accessed in queries directly, because AS is an SQL keyword; the identifier has to be quoted

In [ ]:
c.execute("SELECT el, sl, ve, sh1, hddi FROM covers limit 2;")
print(c.fetchall())
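
Quoting the identifier gets around the keyword clash (assuming the column really is named as):

In [ ]:
c.execute('SELECT "as" FROM covers LIMIT 2')
print(c.fetchall())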
In [ ]:
with conn:
    cnx = conn.cursor()
    cnx.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cnx.fetchall())

    cnx.execute("SELECT * from covers WHERE Cover_Type <2 LIMIT 3")
    covT = cnx.fetchall()
    print(covT)

    cnx.execute("SELECT * FROM covers LIMIT 2")
    k=0
    for row in cnx:
        if k>20:
            break
        print(row)
        k+=1
In [53]:
c.close()
In [56]:
import pandas as pd
query="SELECT * FROM covers WHERE Cover_Type < 2"
db = pd.read_sql_query(query, conn)
db2 = db.head()
 

write to a file

In [57]:
db2.to_csv("test.csv")
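
Reading it back; index_col=0 restores the saved index:

In [ ]:
db3 = pd.read_csv("test.csv", index_col=0)
db3.head()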
 

read data

In [ ]:
import pandas as pd
try:
    df = pd.read_csv("h.csv")
    df.columns.values[0] = "index1"
    df.set_index("index1", inplace=True)  # without inplace=True the result is thrown away
    print(df.head())
except Exception as e:
    print("that went badly:", e)
finally:
    print("what happened?")
 

read data from a web page:

In [5]:
import pandas as pd
import html5lib
statessss = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
print(statessss[0][0][1:4])
 
1    AL
2    AK
3    AZ
Name: 0, dtype: object
In [ ]:
import pandas as pd
import html5lib, sys
url = 'https://www.drugs.com/top200.html'
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
# NOTE: wrong -- read_html's header= is the index of the header row,
# not HTTP request headers, so this call misuses hdr; the urllib approach below works
drugData = pd.read_html(url, header=hdr)
In [27]:
import urllib.request


url1 = 'https://www.drugs.com/top200.html'
def dataFromHtmlInPy(url):

    headers = {'Accept': '*/*',
               'User-Agent': 'Mozilla/5.0',
               'Referer': 'http://www.drugs.com',  # was misspelled 'Refers'
               'Connection': 'keep-alive'
               }

    getContents = requests.get(url, headers=headers).content
    dataFromHtm = pd.read_html(BytesIO(getContents))
    # dataFromHtm = BeautifulSoup(getContents, "lxml")
    return dataFromHtm

data=pd.DataFrame(dataFromHtmlInPy(url=url1))
print(data.to_html())
 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-89667f2930a7> in <module>()
     15     return dataFromHtm
     16 
---> 17 data=pd.DataFrame(dataFromHtmlInPy(url=url1))
     18 print(data.to_html())

<ipython-input-27-89667f2930a7> in dataFromHtmlInPy(url)
     10               }
     11 
---> 12     getContents = requests.get(url,headers=headers).content
     13     dataFromHtm=pd.read_html(BytesIO(getContents))
     14         #dataFromHtm = BeautifulSoup(getContents, "lxml")

NameError: name 'requests' is not defined
 

The NameError is simply the missing import requests; either import it, or switch to urllib.request as the next cell does.
In [ ]:
import pandas as pd
import html5lib, sys
import urllib.request

def read_page(url):
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib.request.Request(url, headers=hdr)
    conn = urllib.request.urlopen(req)
    page = conn.read()
    from io import BytesIO
    df=pd.read_html(BytesIO(page), header=0)
    return df[0]
In [64]:
#url = 'https://www.drugs.com/top200.html'
ur = 'https://www.drugs.com/top200'
#years =list( range(2003,2010))+['', 2011,2012]
years = list(range(2003,2010))
url_list = {}
for y in years:
    url_list[str(y)] = ur + '_' + str(y) + '.html'
url_list['2010'] = 'https://www.drugs.com/top200.html'  # the latest list has no year suffix

pdDic = {}

for y, url in url_list.items():
    pdDic[y]=read_page(url)
    
url_list
#drugStat = pd.read_html('https://www.drugs.com/top200.html')
Out[64]:
{'2005': 'https://www.drugs.com/top200_2005.html',
 '2009': 'https://www.drugs.com/top200_2009.html',
 '2007': 'https://www.drugs.com/top200_2007.html',
 '2003': 'https://www.drugs.com/top200_2003.html',
 '2006': 'https://www.drugs.com/top200_2006.html',
 '2004': 'https://www.drugs.com/top200_2004.html',
 '2008': 'https://www.drugs.com/top200_2008.html',
 '2010': 'https://www.drugs.com/top200.html'}
In [65]:
pdDic['2003'].head()
Out[65]:
   Rank      Drug                 Current Manufacturer  Total Sales ($000)  % Change 2002
0   1.0   Lipitor                          Pfizer Inc.             5538587           0.0%
1   2.0  Prevacid  Takeda Pharmaceuticals U.S.A., Inc.             3568558           0.0%
2   3.0     Zocor                    Merck & Co., Inc.             3277856           0.0%
3   4.0    Nexium       AstraZeneca Pharmaceuticals LP             2700297           0.0%
4   5.0    Zoloft                          Pfizer Inc.             2580509           0.0%
In [12]:
url = "http://www.gocomics.com/calvinandhobbes"
req = urllib.request(url, headers={'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30"})
con = urllib.urlopen(req)
print(con.read())
 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-bc485214819d> in <module>()
      1 url = "http://www.gocomics.com/calvinandhobbes"
----> 2 req = urllib.request(url, headers={'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30"})
      3 con = urllib.urlopen(req)
      4 print(con.read())

TypeError: 'module' object is not callable
 

urllib.request is a module, not a callable: in Python 3 this should be urllib.request.Request(url, headers=...) followed by urllib.request.urlopen(req), as in read_page above.
 

executing shell commands

In [ ]:
!dir

# %run executes and shows the output from
# all code cells of the specified notebook
%run optimization.ipynb
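
Outside IPython, the standard subprocess module does the same job; a minimal sketch (capture_output/text need Python 3.7+):

In [ ]:
import subprocess
# run a shell command and capture its output as text
result = subprocess.run("dir", shell=True, capture_output=True, text=True)
print(result.stdout)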
In [ ]:
# don't try this: !conda install ipyparallel
 

Exception handling

In [ ]:
def KelvinToFahrenheit(Temperature):
    assert (Temperature >= 0), "Colder than absolute zero!"
    return ((Temperature - 273) * 1.8) + 32
print(KelvinToFahrenheit(273))
print(int(KelvinToFahrenheit(505.78)))
print(KelvinToFahrenheit(-5))  # raises AssertionError
In [ ]:
try:
    fh = open("testfile", "w")
    fh.write("This is my test file for exception handling!!")
except IOError:
    print("Error: can't find file or read data")
else:
    print("Written content in the file successfully")
    fh.close()
 

widgets

In [ ]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

def f(x):
    x = x*2
    return x

interact(f, x=10);
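
Passing a tuple instead of a value makes interact build a slider over that range:

In [ ]:
interact(f, x=(0, 20));  # IntSlider from 0 to 20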
In [ ]:
from pythreejs import *

f = """
function f(origu,origv) {
    // scale u and v to the ranges I want: [0, 2*pi]
    var u = 2*Math.PI*origu;
    var v = 2*Math.PI*origv;
    
    var x = Math.sin(u);
    var y = Math.cos(v);
    var z = Math.cos(u+v);
    
    return new THREE.Vector3(x,y,z)
}
"""

surf_g = ParametricGeometry(func=f);
surf = Mesh(geometry=surf_g, material=LambertMaterial(color='green', side='FrontSide'))
surf2 = Mesh(geometry=surf_g, material=LambertMaterial(color='yellow', side='BackSide'))
scene = Scene(children=[surf, surf2, AmbientLight(color='#777777')])
c = PerspectiveCamera(position=[2.5, 2.5, 2.5], up=[0, 1, 1],
                      children=[DirectionalLight(color='white',
                                                 position=[3, 5, 1],
                                                 intensity=0.6)])
Renderer(camera=c, scene=scene, controls=[OrbitControls(controlling=c)])
 

Regression and ANOVA

 

ANOVA

In [ ]:
import os

import statsmodels as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.graphics.factorplots import interaction_plot

import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
pathDir = "D:\\Dropbox\\resiliency\\Codes\\Python"
if not os.path.isdir(pathDir):
    pathDir = "C:\\Users\\Manic Bird\\Dropbox\\resiliency\\Codes\\Python"
try:
    os.chdir(pathDir)
    os.chdir('..')
    os.chdir('..')
    print(os.getcwd())
    dataDir = os.path.join(os.getcwd(),"outputs")

    dt1 = pd.read_excel(os.path.join(dataDir, "bus57_96h 20170214.xlsm"),
                        sheetname='bus57_96h_Out', header=0, parse_cols="A:E")  # newer pandas: sheet_name= / usecols=
    dt1.dropna(inplace=True)
    print(dt1.columns)
except AttributeError as e:
    print(e)
finally:
    print("done (finally always runs)")
In [ ]:
mods = ['Index1~A+B+C', 'Index1~A:B+C', 'Index1~A+B*C']

linModels = {}
anovas = {}
for i, m in enumerate(mods):
    linModels[i] = ols(m, data=dt1).fit()
    # anova_lm was imported above; 'import statsmodels as sm' does not expose sm.stats.anova
    anovas[i] = anova_lm(linModels[i])
    print(anovas[i])
In [ ]:
ip = interaction_plot(dt1['C'], dt1['B'], colors=['red','blue'], markers=['D','^'], ms=10, response=dt1['Index1'])
plt.show()
In [ ]:
os.chdir("C:\\Users\\Manic Bird\\Dropbox\\Python")
data = pd.read_csv("ToothGrowth.csv")
 

Regression

 

Change directory to where the data is saved, read the Excel data with headers, define the new columns 'HL_Percent' and 'percent_Change', and drop the unwanted columns.

In [9]:
import math
import numpy as np
import pandas as pd
from sklearn import preprocessing  # needed for preprocessing.scale below
df = pd.read_excel('GOOGL.xls',"Worksheet1" , header=0)
df.set_index("Date", inplace=True)
# or
#df = pd.read_excel('GOOGL.xls' , header=0 , index_col=0)

print(df.head())

df['HL_Percent']=(df['Adj. High']-df['Adj. Low'])/df['Adj. Low']*100
df['percent_Change']=(df['Adj. Close']-df['Adj. Open'])/df['Adj. Open']*100
df=df[['Adj. Close','HL_Percent','percent_Change','Adj. Volume']]
df.rename(index=str, columns={"Adj. Close" : "Adj_Close", "Adj. Volume" : "Adj_Volume"}, inplace=True)
print(df.head())
forecast_col = 'Adj_Close'
forecast_out = int(math.ceil(0.01 * len(df)))  # forecast horizon: ~1% of the data length
df['labl'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
x = np.array(df.drop(['labl'], 1))
x = preprocessing.scale(x)
y = np.array(df['labl'])
x_forPred = x[-forecast_out:]   # after preprocessing
x_original = x[:-forecast_out]  # after preprocessing
y_forPred = y[-forecast_out:]
y_original = y[:-forecast_out]


print(len(x_original), len(x), len(x_forPred))
 
3111 3111
In [ ]:
import dateutil.parser
import datetime
import time
last_date = df.iloc[-1].name
print(last_date)
last_unix = dateutil.parser.parse(last_date)
print(last_unix)
oneDay = 86400
# add 3 days to current day
next_unix = last_unix + datetime.timedelta(3)
print(next_unix)
In [ ]:
from sklearn import preprocessing, svm
from sklearn import cross_validation  # creates test/train splits; sklearn.model_selection in newer versions

x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y,test_size = 0.2)
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)
kernels = ['rbf', 'poly' ]
for k in kernels:
    clf2 = svm.SVR(k)
    clf2.fit(x_train, y_train)
    print("accuracy for the kernel %s is %.2f"% (k, clf2.score(x_test,y_test)))
In [ ]:
import statsmodels.formula.api as sm
regModel = sm.ols(formula="labl~Adj_Close+percent_Change", data=df)  # the label column was named 'labl' above
regRes = regModel.fit()

print(regRes.summary())
print(regRes.params[1])
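
Fitted values come from predict, which accepts a DataFrame containing the formula's column names (here just reusing the head of df):

In [ ]:
print(regRes.predict(df.head()))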
 

The standard functions filter(), map() and reduce() do various things with a list:

In [ ]:
from functools import reduce  # reduce moved to functools in Python 3
foo = [2, 18, 9, 22, 17, 24, 8, 12, 27]
# map: apply a function to every element (list() is needed in Python 3 to see the result)
print(list(map(lambda x: x * 2 + 10, foo)))
# filter: keep the elements where the predicate is true
g = lambda x: x % 3 == 0
print(g(2))
print(list(filter(g, foo)))
# reduce: fold the list down to a single value
print(reduce(lambda x, y: x + y, foo))
 

Optimization

In [ ]:
c = [-1, 4]
A = [[-3, 1], [1, 2]]
b = [6, 4]
x0_bounds = (None, None)
x1_bounds = (-3, None)
from scipy.optimize import linprog
res = linprog(c, A_ub=A, b_ub=b, bounds=(x0_bounds, x1_bounds),
              options={"disp": True})
print(res)
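
In standard form the problem above reads:

In [ ]:
%%latex
\begin{align}
\min_{x}\; & -x_0 + 4x_1 \\
\text{s.t. } & -3x_0 + x_1 \le 6 \\
             & x_0 + 2x_1 \le 4 \\
             & x_1 \ge -3
\end{align}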
In [34]:
"""
Plot demonstrating the integral as the area under a curve.

Although this is a simple example, it demonstrates some important tweaks:

    * A simple line plot with custom color and line width.
    * A shaded region created using a Polygon patch.
    * A text label with mathtext rendering.
    * figtext calls to label the x- and y-axes.
    * Use of axis spines to hide the top and right spines.
    * Custom tick placement and labels.
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon


def func(x):
    return (x - 3) * (x - 5) * (x - 7) + 85


a, b = 2, 9 # integral limits
x = np.linspace(0, 10)
y = func(x)

fig, ax = plt.subplots()
plt.plot(x, y, 'r', linewidth=2)
plt.ylim(ymin=0)

# Make the shaded region
ix = np.linspace(a, b)
iy = func(ix)
verts = [(a, 0)] + list(zip(ix, iy)) + [(b, 0)]
poly = Polygon(verts, facecolor='0.9', edgecolor='0.5')
ax.add_patch(poly)

plt.text(0.5 * (a + b), 30, r"$\int_a^b f(x)\mathrm{d}x$",
         horizontalalignment='center', fontsize=20)

plt.figtext(0.9, 0.05, '$x$')
plt.figtext(0.1, 0.9, '$y$')

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')

ax.set_xticks((a, b))
ax.set_xticklabels(('$a$', '$b$'))
ax.set_yticks([])

plt.show()
 
In [11]:
%%latex
\begin{align}
a=\frac{1}{2} && b=\frac{1}{3} && c=\frac{1}{4}\\
a && b && c\\
1 && 2 && 3
\end{align}
 
In [19]:
a = [1, 3]
b = [4, 5]
z = zip(a, b)
list(z)
Out[19]:
[(1, 4), (3, 5)]
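
zip(*...) inverts it back into the separate sequences:

In [ ]:
pairs = list(zip(a, b))
a2, b2 = zip(*pairs)  # "unzip": back to (1, 3) and (4, 5)
print(a2, b2)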
In [ ]:
"""Examples illustrating the use of plt.subplots().

This function creates a figure and a grid of subplots with a single call, while
providing reasonable control over how the individual plots are created.  For
very refined tuning of subplot creation, you can still use add_subplot()
directly on a new figure.
"""

import matplotlib.pyplot as plt
import numpy as np

# Simple data to display in various forms
x = np.linspace(0, 2 * np.pi, 400)
y = np.sin(x ** 2)

plt.close('all')

# Just a figure and one subplot
f, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('Simple plot')
In [46]:
# Two subplots, the axes array is 1-d
f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(x, y)
axarr[0].set_title('Sharing X axis')
axarr[1].scatter(x, y)
# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.plot(x, y)
ax1.set_title('Sharing Y axis')
ax2.scatter(x, y)

plt.show()
 
In [47]:
# Three subplots sharing both x/y axes
f, (ax1, ax2, ax3) = plt.subplots(3, sharex=True, sharey=True)
ax1.plot(x, y)
ax1.set_title('Sharing both axes')
ax2.scatter(x, y)
ax3.scatter(x, 2 * y ** 2 - 1, color='r')
# Fine-tune figure; make subplots close to each other and hide x ticks for
# all but bottom plot.
f.subplots_adjust(hspace=0)
plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
plt.show()
 
In [48]:
# row and column sharing
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row')
ax1.plot(x, y)
ax1.set_title('Sharing x per column, y per row')
ax2.scatter(x, y)
ax3.scatter(x, 2 * y ** 2 - 1, color='r')
ax4.plot(x, 2 * y ** 2 - 1, color='r')
plt.show()
 
In [ ]:
# Four axes, returned as a 2-d array
f, axarr = plt.subplots(2, 2)
axarr[0, 0].plot(x, y)
axarr[0, 0].set_title('Axis [0,0]')
axarr[0, 1].scatter(x, y)
axarr[0, 1].set_title('Axis [0,1]')
axarr[1, 0].plot(x, y ** 2)
axarr[1, 0].set_title('Axis [1,0]')
axarr[1, 1].scatter(x, y ** 2)
axarr[1, 1].set_title('Axis [1,1]')
# Fine-tune figure; hide x ticks for top plots and y ticks for right plots
plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False)
plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False)

# Four polar axes
f, axarr = plt.subplots(2, 2, subplot_kw=dict(projection='polar'))
axarr[0, 0].plot(x, y)
axarr[0, 0].set_title('Axis [0,0]')
axarr[0, 1].scatter(x, y)
axarr[0, 1].set_title('Axis [0,1]')
axarr[1, 0].plot(x, y ** 2)
axarr[1, 0].set_title('Axis [1,0]')
axarr[1, 1].scatter(x, y ** 2)
axarr[1, 1].set_title('Axis [1,1]')
# Fine-tune figure; make subplots farther from each other.
f.subplots_adjust(hspace=0.3)

plt.show()