Commit 3299e07 (0 parents). Showing 35 changed files with 48,034 additions and 0 deletions.

Binary file not shown.
@@ -0,0 +1,36 @@

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

url = "https://sci.nic.in/php/case_status/case_status_process.php"

CaseType = "7"
CaseNumber = "12"
CaseYear = "2018"

# Pass the form fields as a dict so requests url-encodes them and computes
# Content-Length itself; a quoted string would be posted as literal text.
login_data = {'ct': CaseType, 'cn': CaseNumber, 'cy': CaseYear}

# verify=False skips TLS certificate validation for this endpoint.
content = requests.post(url, data=login_data, headers=headers, verify=False)
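With verify=False, requests (via urllib3) emits an InsecureRequestWarning on every call. A minimal sketch, not part of the original commit, assuming the warning should simply be silenced rather than fixed with a proper certificate bundle:

# Not in the original commit: suppress the InsecureRequestWarning that
# each verify=False request otherwise prints to stderr.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)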
@@ -0,0 +1,23 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.27703.2000
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "sci", "sci\sci.pyproj", "{A7FFC072-BA0B-4597-845C-C5532504F2B2}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{A7FFC072-BA0B-4597-845C-C5532504F2B2}.Release|Any CPU.ActiveCfg = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {96A45083-DCCA-4150-8954-A44A0EB2A0A7}
	EndGlobalSection
EndGlobal
@@ -0,0 +1,66 @@

import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set()
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        # Queue every same-site link that has not been scraped yet.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        # Placeholder: override to extract data from each fetched page.
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            # timeout=(3, 30): 3 s to connect, 30 s to read.
            res = requests.get(url, verify=False, timeout=(3, 30))
            return res
        except requests.RequestException as e:
            print(e)
            return

    def run_scraper(self):
        while True:
            try:
                # Exit once the queue has been empty for 60 seconds.
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://sci.nic.in/")
    s.run_scraper()
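scrape_info is deliberately a stub, so the crawler can be reused for different extraction jobs. A minimal sketch of one way to plug in page-level extraction (the TitleScraper name is invented here, not part of the commit):

# Hypothetical subclass, not in the original commit: override the
# scrape_info stub to pull the <title> out of every fetched page.
class TitleScraper(MultiThreadScraper):
    def scrape_info(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('title')
        if title:
            print("Title:", title.get_text(strip=True))

# TitleScraper("https://sci.nic.in/").run_scraper()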
@@ -0,0 +1,75 @@

import logging
from bs4 import BeautifulSoup
import requests
import pandas as pd


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return


## find data
def get_display_board_data(soup):
    try:
        table = soup.find_all('table', {"class": "board_id"})
    except AttributeError:  # soup is None when the response body was empty
        table = None
    return table


def get_displayboard_request(url, headers, cookies):
    try:
        # timeout=(3, 30): 3 s to connect, 30 s to read.
        res = requests.post(url, headers=headers, cookies=cookies, verify=False, timeout=(3, 30))
        res.raise_for_status()
        print(res.content)
    except requests.HTTPError as e:
        logging.warning('SC Display returned a non-200 status code')
        raise e
    except requests.RequestException as e:
        logging.warning('Issue retrieving SC results page')
        raise e
    else:
        return res


def get_sc_displayboard():
    sc_display_url = 'https://sci.nic.in/php/display/get_board.php'
    # Session cookie captured from a browser; it expires and must be
    # refreshed before the script is rerun.
    cookies = {
        'has_js': '1',
        'PHPSESSID': '8j1pgblhj22d21rejvdinvehg1',
    }

    headers = {
        'Origin': 'https://sci.nic.in',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://sci.nic.in/display-board',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Content-Length': '0',
    }
    response = get_displayboard_request(sc_display_url, headers, cookies)
    return response


res = get_sc_displayboard()
soup = get_soup(res.content)
display_table = get_display_board_data(soup)
print(display_table)
dis = pd.read_html(res.content, header=0)
print(dis)
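pd.read_html parses every <table> element in the markup and returns a list of DataFrames, which is why the script prints dis rather than a single table. A minimal sketch with made-up HTML (not the live display-board response):

# Illustration only: read_html returns one DataFrame per <table> found.
from io import StringIO
import pandas as pd

sample = "<table><tr><th>Court</th><th>Item</th></tr><tr><td>1</td><td>101</td></tr></table>"
tables = pd.read_html(StringIO(sample), header=0)
print(tables[0])  # first parsed table as a DataFrame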
@@ -0,0 +1,130 @@

import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd

output_file = 'scilink3.csv'


class CaseDetails(object):

    def __init__(self, court_sub_court_type, case_no, diary_num, case_party):
        self.court_base = "SC"
        self.court_sub_court_type = court_sub_court_type
        self.case_no = case_no
        self.case_party = case_party
        self.petitioner = ''
        self.diary_num = diary_num


def csv_output(diary, petitioner):
    # newline='' stops the csv module from writing blank rows on Windows.
    with open(output_file, 'a', encoding='utf-8', newline='') as outputfile:
        writer = csv.writer(outputfile)
        writer.writerow([diary, petitioner])


def get_soup(html):
    if html is not None:
        soup = BeautifulSoup(html, 'lxml')
        return soup
    else:
        return


cookies = {
    'has_js': '1',
    'PHPSESSID': 'bs4d3459o2r7of6fqg7mipmec0',
}

headers = {
    'Origin': 'https://sci.nic.in',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Referer': 'https://sci.nic.in/case-status',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
}


def get_case_data(court_type, case_num, case_year):
    data = [
        ('ct', court_type),
        ('cn', case_num),
        ('cy', case_year),
    ]
    try:
        r = requests.post('https://sci.nic.in/php/case_status/case_status_process.php', headers=headers, cookies=cookies, data=data, verify=False, timeout=(3, 30))
        soup = get_soup(r.content)
        a = CaseDetails('1', 1, 'diary-123', 'x vs y')
        details = soup.find_all("h5")
        # If the case does not exist, return 0 so the caller can stop looping.
        case_dne = soup.body.find_all(string='Case Not Found')
        if case_dne:
            return 0
        case_details_pd = pd.read_html(r.content, header=0)
        print(case_details_pd)
        for item in details:
            if "Diary No." in item.text:
                a.diary_num = item.text
            else:
                a.petitioner = item.text

        csv_output(a.diary_num, a.petitioner)
        print(a.diary_num)
        print(a.petitioner)

        # read_html returns one DataFrame per table; append each to its own file.
        for i, df in enumerate(case_details_pd):
            df.to_csv('myfile_%s.csv' % i, mode='a')
        return 1

    except requests.exceptions.RequestException as e:
        print(e)
        return 0


# Scrape cases for every court type and year. There are 41 court types and
# 68 years (1950-2018); a case number can be anything from 1 to 999999, so
# iterate case numbers until one is not found, then move on.
def scrap_case_details():
    for court_year in range(2018, 2019):   # restricted to 2018 for testing
        for court_type in range(1, 42):    # court types 1-41
            for case_num in range(1, 999999):
                case_found = get_case_data(court_type, case_num, court_year)
                if case_found == 0:
                    print("case not found")
                    break


# fetch the case details
scrap_case_details()
#get_case_data('1','3','2018')
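requests url-encodes a list of 2-tuples and a dict identically, so the payload in get_case_data could equally be written as a dict. A minimal sketch (argument values are just examples):

# Equivalent form payload: requests encodes both shapes to ct=1&cn=3&cy=2018.
payload = {'ct': '1', 'cn': '3', 'cy': '2018'}
r = requests.post('https://sci.nic.in/php/case_status/case_status_process.php',
                  headers=headers, cookies=cookies, data=payload,
                  verify=False, timeout=(3, 30))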