Commit 3299e07 (0 parents). Showing 35 changed files with 48,034 additions and 0 deletions.

Binary file not shown.
@@ -0,0 +1,36 @@

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

url = "https://sci.nic.in/php/case_status/case_status_process.php"

CaseType = "7"
CaseNumber = "12"
CaseYear = "2018"

# Pass the form fields as a dict so requests url-encodes them and computes
# Content-Length itself; a quoted string would be posted as literal text.
login_data = {'ct': CaseType, 'cn': CaseNumber, 'cy': CaseYear}

# verify=False skips TLS certificate validation for this endpoint.
content = requests.post(url, data=login_data, headers=headers, verify=False)
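With verify=False, requests (via urllib3) emits an InsecureRequestWarning on every call. A minimal sketch, not part of the original commit, assuming the warning should simply be silenced rather than fixed with a proper certificate bundle:

# Not in the original commit: suppress the InsecureRequestWarning that
# each verify=False request otherwise prints to stderr.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)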
@@ -0,0 +1,23 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.27703.2000
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "sci", "sci\sci.pyproj", "{A7FFC072-BA0B-4597-845C-C5532504F2B2}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Release|Any CPU.ActiveCfg = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {96A45083-DCCA-4150-8954-A44A0EB2A0A7}
	EndGlobalSection
EndGlobal
@@ -0,0 +1,66 @@

import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set()
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        # Queue every same-site link that has not been scraped yet.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        # Placeholder: override to extract data from each fetched page.
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            # timeout=(3, 30): 3 s to connect, 30 s to read.
            res = requests.get(url, verify=False, timeout=(3, 30))
            return res
        except requests.RequestException as e:
            print(e)
            return

    def run_scraper(self):
        while True:
            try:
                # Exit once the queue has been empty for 60 seconds.
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://sci.nic.in/")
    s.run_scraper()
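scrape_info is deliberately a stub, so the crawler can be reused for different extraction jobs. A minimal sketch of one way to plug in page-level extraction (the TitleScraper name is invented here, not part of the commit):

# Hypothetical subclass, not in the original commit: override the
# scrape_info stub to pull the <title> out of every fetched page.
class TitleScraper(MultiThreadScraper):
    def scrape_info(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('title')
        if title:
            print("Title:", title.get_text(strip=True))

# TitleScraper("https://sci.nic.in/").run_scraper()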
@@ -0,0 +1,75 @@

import logging
from bs4 import BeautifulSoup
import requests
import pandas as pd


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return


## find data
def get_display_board_data(soup):
    try:
        table = soup.find_all('table', {"class": "board_id"})
    except AttributeError:  # soup is None when the response body was empty
        table = None
    return table


def get_displayboard_request(url, headers, cookies):
    try:
        # timeout=(3, 30): 3 s to connect, 30 s to read.
        res = requests.post(url, headers=headers, cookies=cookies, verify=False, timeout=(3, 30))
        res.raise_for_status()
        print(res.content)
    except requests.HTTPError as e:
        logging.warning('SC Display returned a non-200 status code')
        raise e
    except requests.RequestException as e:
        logging.warning('Issue retrieving SC results page')
        raise e
    else:
        return res


def get_sc_displayboard():
    sc_display_url = 'https://sci.nic.in/php/display/get_board.php'
    # Session cookie captured from a browser; it expires and must be
    # refreshed before the script is rerun.
    cookies = {
        'has_js': '1',
        'PHPSESSID': '8j1pgblhj22d21rejvdinvehg1',
    }

    headers = {
        'Origin': 'https://sci.nic.in',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://sci.nic.in/display-board',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Content-Length': '0',
    }
    response = get_displayboard_request(sc_display_url, headers, cookies)
    return response


res = get_sc_displayboard()
soup = get_soup(res.content)
display_table = get_display_board_data(soup)
print(display_table)
dis = pd.read_html(res.content, header=0)
print(dis)
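pd.read_html parses every <table> element in the markup and returns a list of DataFrames, which is why the script prints dis rather than a single table. A minimal sketch with made-up HTML (not the live display-board response):

# Illustration only: read_html returns one DataFrame per <table> found.
from io import StringIO
import pandas as pd

sample = "<table><tr><th>Court</th><th>Item</th></tr><tr><td>1</td><td>101</td></tr></table>"
tables = pd.read_html(StringIO(sample), header=0)
print(tables[0])  # first parsed table as a DataFrame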
@@ -0,0 +1,130 @@

import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd

output_file = 'scilink3.csv'


class CaseDetails(object):

    def __init__(self, court_sub_court_type, case_no, diary_num, case_party):
        self.court_base = "SC"
        self.court_sub_court_type = court_sub_court_type
        self.case_no = case_no
        self.case_party = case_party
        self.petitioner = ''
        self.diary_num = diary_num


def csv_output(diary, petitioner):
    # newline='' stops the csv module from writing blank rows on Windows.
    with open(output_file, 'a', encoding='utf-8', newline='') as outputfile:
        writer = csv.writer(outputfile)
        writer.writerow([diary, petitioner])


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return


cookies = {
    'has_js': '1',
    'PHPSESSID': 'bs4d3459o2r7of6fqg7mipmec0',
}

headers = {
    'Origin': 'https://sci.nic.in',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Referer': 'https://sci.nic.in/case-status',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
}


def get_case_data(court_type, case_num, case_year):
    data = [
        ('ct', court_type),
        ('cn', case_num),
        ('cy', case_year),
    ]
    try:
        r = requests.post('https://sci.nic.in/php/case_status/case_status_process.php', headers=headers, cookies=cookies, data=data, verify=False, timeout=(3, 30))
        soup = get_soup(r.content)
        a = CaseDetails('1', 1, 'diary-123', 'x vs y')
        details = soup.find_all("h5")
        # If the case does not exist, return 0 so the caller can stop looping.
        case_dne = soup.body.find_all(string='Case Not Found')
        if case_dne:
            return 0
        case_details_pd = pd.read_html(r.content, header=0)
        print(case_details_pd)
        for item in details:
            if "Diary No." in item.text:
                a.diary_num = item.text
            else:
                a.petitioner = item.text

        csv_output(a.diary_num, a.petitioner)
        print(a.diary_num)
        print(a.petitioner)

        # read_html returns one DataFrame per table; append each to its own file.
        for i, df in enumerate(case_details_pd):
            df.to_csv('myfile_%s.csv' % i, mode='a')
        return 1

    except requests.exceptions.RequestException as e:
        print(e)
        return 0


# Scrape cases for every court type and year. There are 41 court types and
# 68 years (1950-2018); a case number can be anything from 1 to 999999, so
# iterate case numbers until one is not found, then move on.
def scrap_case_details():
    for court_year in range(2018, 2019):   # restricted to 2018 for testing
        for court_type in range(1, 42):    # court types 1-41
            for case_num in range(1, 999999):
                case_found = get_case_data(court_type, case_num, court_year)
                if case_found == 0:
                    print("case not found")
                    break


# fetch the case details
scrap_case_details()
#get_case_data('1','3','2018')
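requests url-encodes a list of 2-tuples and a dict identically, so the payload in get_case_data could equally be written as a dict. A minimal sketch (argument values are just examples):

# Equivalent form payload: requests encodes both shapes to ct=1&cn=3&cy=2018.
payload = {'ct': '1', 'cn': '3', 'cy': '2018'}
r = requests.post('https://sci.nic.in/php/case_status/case_status_process.php',
                  headers=headers, cookies=cookies, data=payload,
                  verify=False, timeout=(3, 30))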