Commit 3299e07: pylegal first commit
manusw committed Jul 4, 2018 (0 parents)
Showing 35 changed files with 48,034 additions and 0 deletions.
Binary file added sci/.vs/sci/v15/.suo
36 changes: 36 additions & 0 deletions sci/module1.py
@@ -0,0 +1,36 @@
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    # 'Content-Length' is set automatically by requests; hard-coding it risks a malformed request.
    # 'Cookie': 'has_js=1; PHPSESSID=tuj8j2vr07fmpq5vc12ge170q4',
    # 'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

url = "https://sci.nic.in/php/case_status/case_status_process.php"

CaseType = "7"
CaseNumber = "12"
CaseYear = "2018"

# The form fields must be passed as a dict (or list of tuples); posting the
# string "{ct: CaseType, ...}" would send raw text instead of URL-encoded form data.
login_data = {'ct': CaseType, 'cn': CaseNumber, 'cy': CaseYear}

# verify=False disables TLS certificate validation for this host.
content = requests.post(url, data=login_data, headers=headers, verify=False)
print(content.text)
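A quick way to sanity-check the response is to parse the returned markup; a minimal sketch, assuming the case-status fields come back in <h5> tags as case_status_call.py further down expects:

from bs4 import BeautifulSoup

# Illustrative follow-up: list the headline fields in the response.
soup = BeautifulSoup(content.text, 'html.parser')
for heading in soup.find_all('h5'):
    print(heading.get_text(strip=True))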
23 changes: 23 additions & 0 deletions sci/sci.sln
@@ -0,0 +1,23 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.27703.2000
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "sci", "sci\sci.pyproj", "{A7FFC072-BA0B-4597-845C-C5532504F2B2}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Release|Any CPU.ActiveCfg = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {96A45083-DCCA-4150-8954-A44A0EB2A0A7}
	EndGlobalSection
EndGlobal
66 changes: 66 additions & 0 deletions sci/sci/Multithread_post_case-status.py
@@ -0,0 +1,66 @@
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set()
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        # Queue every same-site link that has not already been scraped.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        # Placeholder: override to extract data from each fetched page.
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, verify=False, timeout=(3, 30))
            return res
        except requests.RequestException as e:
            print(e)
            return

    def run_scraper(self):
        # Drain the crawl queue, handing each new URL to the thread pool;
        # exit once the queue has been empty for 60 seconds.
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://sci.nic.in/")
    s.run_scraper()
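One caveat worth noting: because scrape_page passes verify=False, urllib3 emits an InsecureRequestWarning for every fetch. A small sketch of silencing that noise (it does not make the connections any safer):

import urllib3

# Suppress the per-request InsecureRequestWarning triggered by verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)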

75 changes: 75 additions & 0 deletions sci/sci/Sc_display_board.py
@@ -0,0 +1,75 @@
import logging

from bs4 import BeautifulSoup
import pandas as pd
import requests


def get_soup(html):
    if html is not None:
        return BeautifulSoup(html, 'lxml')
    return None


## find data
def get_display_board_data(soup):
    """Return the display-board table elements, or None if absent."""
    try:
        table = soup.find_all('table', {"class": "board_id"})
    except AttributeError:
        table = None
    return table


def get_displayboard_request(url, headers, cookies):
    try:
        res = requests.post(url, headers=headers, cookies=cookies,
                            verify=False, timeout=(3, 30))
        res.raise_for_status()
        print(res.content)
    except requests.HTTPError as e:
        logging.warning('SC Display returned a non-200 status code')
        raise e
    except requests.RequestException as e:
        logging.warning('Issue retrieving SC results page')
        raise e
    else:
        return res


def get_sc_displayboard():
    sc_display_url = 'https://sci.nic.in/php/display/get_board.php'
    # The PHPSESSID below is session-specific and will have expired.
    cookies = {
        'has_js': '1',
        'PHPSESSID': '8j1pgblhj22d21rejvdinvehg1',
    }

    headers = {
        'Origin': 'https://sci.nic.in',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://sci.nic.in/display-board',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Content-Length': '0',
    }
    return get_displayboard_request(sc_display_url, headers, cookies)


res = get_sc_displayboard()
soup = get_soup(res.content)
display_table = get_display_board_data(soup)
print(display_table)
dis = pd.read_html(res.content, header=0)
print(dis)
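If pd.read_html does locate the board table, the parsed DataFrame can be written out directly; a minimal sketch, with an illustrative file name:

# Assumes read_html returned at least one table; the file name is illustrative.
if dis:
    dis[0].to_csv('sc_display_board.csv', index=False)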
130 changes: 130 additions & 0 deletions sci/sci/case_status_call.py
@@ -0,0 +1,130 @@
import csv

from bs4 import BeautifulSoup
import pandas as pd
import requests

output_file = 'scilink3.csv'


class CaseDetails(object):

    def __init__(self, court_sub_court_type, case_no, diary_num, case_party):
        self.court_base = "SC"
        self.court_sub_court_type = court_sub_court_type
        self.case_no = case_no
        self.case_party = case_party
        self.petitioner = ''
        self.diary_num = diary_num


def csv_output(diary, petitioner):
    with open(output_file, 'a', encoding='utf-8') as outputfile:
        writer = csv.writer(outputfile)
        writer.writerow([diary, petitioner])


# csv_output('testurl', 'test title')
def get_soup(html):
    if html is not None:
        return BeautifulSoup(html, 'lxml')
    return None


## find data
def get_data(soup):
    try:
        title = soup.find('title').get_text().strip().replace('\n', '')
    except AttributeError:
        title = None
    return title


cookies = {
    'has_js': '1',
    'PHPSESSID': 'bs4d3459o2r7of6fqg7mipmec0',
}

headers = {
    'Origin': 'https://sci.nic.in',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Referer': 'https://sci.nic.in/case-status',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
}


def get_case_data(court_type, case_num, case_year):
    data = [
        ('ct', court_type),
        ('cn', case_num),
        ('cy', case_year),
    ]
    try:
        r = requests.post('https://sci.nic.in/php/case_status/case_status_process.php',
                          headers=headers, cookies=cookies, data=data,
                          verify=False, timeout=(3, 30))
        # print(r.text)  # debug
        soup = get_soup(r.content)
        a = CaseDetails('1', 1, 'diary-123', 'x vs y')
        details = soup.find_all("h5")
        # If the case does not exist, return 0 so the caller can break its loop.
        case_dne = soup.body.findAll(text='Case Not Found')
        if case_dne:
            return 0
        case_details_pd = pd.read_html(r.content, header=0)
        print(case_details_pd)
        for item in details:
            if "Diary No." in item.text:
                a.diary_num = item.text
            else:
                a.petitioner = item.text

        csv_output(a.diary_num, a.petitioner)
        print(a.diary_num)
        print(a.petitioner)

        # Append each parsed table to its own CSV file.
        for i, df in enumerate(case_details_pd):
            df.to_csv('myfile_%s.csv' % i, mode='a')
        return 1

    except requests.exceptions.RequestException as e:
        print(e)
        return 0


# Scrape the cases for each court type and every year.
# There are 41 court types and 69 years from 1950 to 2018; case numbers
# can be anything from 1 to 999999, so scan upward and stop at the first
# number that is not found. For testing, only the year 2018 is scanned here.
def scrape_case_details():
    for court_year in range(2018, 2019):
        for court_type in range(1, 42):
            for case_num in range(1, 999999):
                case_found = get_case_data(court_type, case_num, court_year)
                if case_found == 0:
                    print("case not found")
                    break


# fetch the case details
scrape_case_details()
# get_case_data('1', '3', '2018')
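Scanning case numbers up to 999999 per court back-to-back is likely to strain the server or trip rate limiting; a hedged sketch of pacing the loop, where the one-second delay is an assumption rather than anything sci.nic.in documents:

import time

def get_case_data_politely(court_type, case_num, case_year, delay=1.0):
    # Illustrative wrapper: sleep between requests so the scan does not
    # hammer the endpoint. The delay value is an arbitrary assumption.
    result = get_case_data(court_type, case_num, case_year)
    time.sleep(delay)
    return result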

