Nov-28-2023, 09:05 AM
I want to create a resume parser. I have used several tools for it, such as pyap, fitz (PyMuPDF), tempfile, and spaCy. The flow is: a user hits our API endpoint with a PDF or DOCX file, and the API should respond with the entities extracted from the resume. It does not work well for the person details, so I am using my own custom logic for the education, email, and contact details.
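For reference, this is roughly how a client is expected to call the endpoint below (a minimal sketch, assuming the Flask app runs locally on port 5000; the file name and path are placeholders):

import requests

# POST a resume to /upload as multipart form data; the field name
# 'file' must match what the endpoint reads from request.files.
with open('resume.pdf', 'rb') as fh:
    response = requests.post(
        'http://127.0.0.1:5000/upload',
        files={'file': ('resume.pdf', fh, 'application/pdf')},
    )

print(response.status_code)
print(response.json())  # expected shape: {'entities': {...}}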
# Import necessary libraries
from flask import Flask, request, jsonify
from docx import Document
import fitz  # PyMuPDF
import spacy
import traceback
import os
import tempfile
import re
import pyap  # address parsing (imported but not used below)
import requests
# import usaddress

# Initialize Flask app
app = Flask(__name__)

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to read text from a DOCX file
def read_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + '\n'
    return text

# Function to read text from a PDF file
def read_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc[page_num]
            text += page.get_text() + '\n'
    return text

# Function to read text from a TXT file
def read_text_from_txt(txt_path):
    with open(txt_path, 'r') as file:
        text = file.read()
    return text

# Function to extract entities (persons, organizations, locations, dates,
# emails, phones, and education) from text
def extract_entities(text):
    doc = nlp(text)
    entities = {
        'persons': [ent.text for ent in doc.ents if ent.label_ == 'PERSON'],
        'organizations': [ent.text for ent in doc.ents if ent.label_ == 'ORG'],
        'locations': [ent.text for ent in doc.ents if ent.label_ == 'LOC'],
        'dates': [ent.text for ent in doc.ents if ent.label_ == 'DATE'],
        'emails': extract_emails(text),
        'phones': extract_phone_numbers(text),
        'education': extract_education_nlp(text),
        # NOTE: 'Institute' and 'INTERNSHIP' are not labels that
        # en_core_web_sm produces, so these two lists stay empty unless
        # a custom-trained model or an EntityRuler supplies them.
        'Institute': [ent.text for ent in doc.ents if ent.label_ == 'Institute'],
        'INTERNSHIP': [ent.text for ent in doc.ents if ent.label_ == 'INTERNSHIP'],
        'expertise': extract_medical_expertise(text),
        # Works, but returns multiple addresses from the uploaded resume
        # text instead of a single one
        'address': extract_addresses(text)
    }
    return entities

def validate_address_fun(address):
    # Make an API call to Bing Maps to validate the address.
    # Replace 'YOUR_BING_MAPS_API_KEY' with your actual API key.
    api_key = 'YOUR_BING_MAPS_API_KEY'
    base_url = 'http://dev.virtualearth.net/REST/v1/Locations'
    params = {
        'q': address,
        'key': api_key,
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    # Check if the address was successfully validated
    if 'resourceSets' in data and data['resourceSets']:
        resources = data['resourceSets'][0]['resources']
        if resources:
            return resources[0]['address']['formattedAddress']
    return None

def extract_addresses(text):
    # Extract all lines from the text
    lines = text.split('\n')
    # Convert the text to lowercase once
    lower_text = text.lower()
    # Iterate through each line and attempt to validate addresses
    validated_addresses = []
    for line in lines:
        # Validate the address using the Bing Maps API
        validated_address1 = validate_address_fun(line)
        # Check that the address is not None before lowercasing it
        if validated_address1 is not None:
            lower_validated_address1 = validated_address1.lower()
            # Keep the address only if it occurs in the resume text
            if lower_validated_address1 in lower_text:
                print(validated_address1)
                validated_addresses.append(validated_address1)
    return validated_addresses

def extract_medical_expertise(text):
    expertise_patterns = [
        'Computer Engineer', 'Software Developer', 'Web Developer',
        'Systems Analyst', 'Data Scientist', 'Machine Learning Engineer',
        'Full Stack Developer', 'DevOps Engineer', 'Network Engineer',
        'Database Administrator', 'Cybersecurity Analyst', 'C# Developer',
        'Java Developer', 'Python Developer', 'HTML/CSS Developer',
        'JavaScript Developer', 'React.js Developer', 'Angular Developer',
        'Node.js Developer', 'SQL Developer', 'Cloud Engineer',
        'Big Data Engineer', 'Mobile App Developer', 'Frontend Developer',
        'Backend Developer', 'C', 'C++', 'Flutter', 'OOPJ', 'OOP', 'API',
        'HTML', 'CSS', 'Dart',
        # Add more patterns as needed
    ]
    doc = nlp(text)
    included_terms = set()  # Track terms already included
    expertise = []
    for ent in doc.ents:
        if ent.text in expertise_patterns and ent.text not in included_terms:
            expertise.append(ent.text)
            included_terms.add(ent.text)
    return expertise

def extract_emails(text):
    # Use a regular expression to find email addresses
    # (fixed: the original character class [A-Z|a-z] also matched '|')
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

def extract_phone_numbers(text):
    # Use a regular expression to find phone numbers
    phone_pattern = re.compile(r'''
        (?:\+\d{1,4}\s?)?    # Optional country code (e.g., +91, +1)
        (?:\(\d{1,4}\)\s?)?  # Optional area code in parentheses
        \d{10}               # Main phone number: exactly 10 digits
        (?:[.\s-]\d{1,4})?   # Optional extension separated by ., space, or -
        \b                   # Word boundary
    ''', re.VERBOSE)
    phone_numbers = re.findall(phone_pattern, text)
    return phone_numbers

# Function to extract education details from text
def extract_education_nlp(text):
    doc = nlp(text)
    education_details = []
    # Define patterns for education qualifications
    medical_education_patterns = [
        'B.SC.(CA & IT)', 'M.SC.(CA & IT)', 'B.DC', 'M.SC', 'B.SC',
        'BCOM', 'MCOM', 'BCA', 'MCA', 'HSC', 'SSC', "BACHELOR'S",
        "MASTER'S", "HIGHER EDUCATION", 'CHNA 1',
        'Computer Network Engineer', 'H.S.C', 'S.S.C',
    ]
    education_details1 = []
    # Extract matches for the education patterns
    # (fixed: re.escape so '.', '(' and ')' are matched literally instead
    # of being treated as regex metacharacters)
    for pattern in medical_education_patterns:
        matches = re.findall(re.escape(pattern), text, re.IGNORECASE)
        if matches:
            education_details1.extend(matches)

    # If the years in the resume appear in ascending order, reverse the
    # matched qualifications so they line up with the dates below
    years = [int(ent.text) for ent in doc.ents
             if ent.label_ == 'DATE' and ent.text.isdigit()]
    if years == sorted(years):
        education_details1.reverse()

    i = 0  # Index into the matched qualifications
    # Iterate over entities in the document
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            duration = ent.text
            education_info = {
                'duration': duration,
                'institution': '',
                'degree': '',
                'specialization': '',
                'school': '',
                'result': ''
            }
            # Iterate over tokens in the same sentence
            for token in ent.sent:
                if token.ent_type_ == 'ORG':
                    education_info['institution'] = token.text
                elif token.ent_type_ == 'GPE' and not education_info['institution']:
                    education_info['institution'] = token.text
                elif token.ent_type_ == 'NORP':
                    education_info['school'] = token.text
                elif token.ent_type_ == 'PRODUCT':
                    # Extract specialization (fixed: moved inside the token
                    # loop; originally it ran after the loop and only ever
                    # saw the last token of the sentence)
                    education_info['specialization'] = token.text
            # Assign the next matched qualification, if any (fixed: the
            # original guard "head is NOUN or head is not NOUN" was a
            # tautology, so it has been dropped)
            if education_details1:
                education_info['degree'] = education_details1[i % len(education_details1)]
                i += 1
            # Extract result or percentage
            for child in ent.root.children:
                if child.ent_type_ == 'CARDINAL' and '%' in child.text:
                    education_info['result'] = child.text
            education_details.append(education_info)
    return education_details

# Function to read a document based on its type (docx, pdf, txt)
def read_document(file, file_type):
    if file_type == 'docx':
        return read_text_from_docx(file)
    elif file_type == 'pdf':
        # Save the uploaded PDF to a temporary file
        temp_file, temp_file_path = tempfile.mkstemp(suffix='.pdf')
        with os.fdopen(temp_file, 'wb') as temp:
            temp.write(file.read())
        text = read_text_from_pdf(temp_file_path)
        os.remove(temp_file_path)  # Remove the temporary file
        return text
    elif file_type == 'txt':
        # fixed: the upload is a stream, not a path, so read it directly
        return file.read().decode('utf-8', errors='ignore')
    else:
        return "Unsupported file type"

# Endpoint for file upload
@app.route('/upload', methods=['POST'])
def upload_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    uploaded_file = request.files['file']
    if uploaded_file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    file_type = uploaded_file.filename.rsplit('.', 1)[1].lower()
    if file_type not in ['docx', 'pdf', 'txt']:
        return jsonify({'error': 'Unsupported file type'}), 400
    try:
        text = read_document(uploaded_file, file_type)
        entities = extract_entities(text)
        return jsonify({'entities': entities}), 200
    except Exception as e:
        print("Error:", str(e))
        traceback.print_exc()  # Print the full traceback
        return jsonify({'error': str(e)}), 500

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=True)

Using this code I do not get proper output for the location, person, and experience entities; the other fields come from my custom logic.
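Two things I suspect about the weak person/location output, with a minimal sketch of what I would try (assuming spaCy 3.x and en_core_web_sm; the ruler patterns are hypothetical examples):

import spacy

nlp = spacy.load("en_core_web_sm")

# 1) The English models tag cities, states, and countries as GPE, not LOC,
#    so filtering on 'LOC' alone usually returns an empty list.
def extract_locations(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ('GPE', 'LOC')]

# 2) An EntityRuler inserted before the statistical NER component can
#    force labels the model misses (e.g. the custom 'INTERNSHIP' label
#    above, which no stock model ever produces on its own).
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([
    {"label": "INTERNSHIP", "pattern": [{"LOWER": "internship"}]},
])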
After that, I decided to try the ready-made library pyresparser, like this:
from flask import Flask, render_template, request, jsonify
from pyresparser import ResumeParser
import spacy
import os

app = Flask(__name__)

# Load spaCy model
custom_nlp = spacy.load('en_core_web_sm')

@app.route('/')
def index():
    return render_template('upload.html')

@app.route('/upload', methods=['POST'])
def upload_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    uploaded_file = request.files['file']
    if uploaded_file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Save the uploaded file to a specific location on the server
    upload_folder = 'uploads'
    os.makedirs(upload_folder, exist_ok=True)  # Create the folder if it doesn't exist
    file_path = os.path.join(upload_folder, uploaded_file.filename)
    uploaded_file.save(file_path)
    # Parse the resume using pyresparser
    data = ResumeParser(file_path).get_extracted_data()
    # Process the parsed data with spaCy or other libraries as needed
    processed_data = process_data_with_spacy(data)
    print('---------------------------------------------------')
    print(processed_data)
    return jsonify({'processed_data': processed_data}), 200

def process_data_with_spacy(data):
    # print(data)
    all_entities = []
    # Process 'name' field
    if 'name' in data and data['name']:
        name_entities = extract_entities_from_text(data['name'])
        all_entities.extend(name_entities)
    # Process 'email' field
    if 'email' in data and data['email']:
        email_entities = extract_entities_from_text(data['email'])
        all_entities.extend(email_entities)
    # Process 'skills' field
    if 'skills' in data and data['skills']:
        skills_text = ' '.join(data['skills'])
        skills_entities = extract_entities_from_text(skills_text)
        print(skills_text)
        all_entities.extend(skills_entities)
    # Add more fields as needed
    return all_entities

def extract_entities_from_text(text):
    doc = custom_nlp(text)
    entities = [{'text': ent.text, 'label': ent.label_} for ent in doc.ents]
    return entities

if __name__ == '__main__':
    app.run(debug=True)

But this does not return all the data from the resume: it gives me only the name and the skills, and within skills only two items, while I have many more in my CV.
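One likely cause is that pyresparser only matches skills against the skills.csv bundled with the library, which would explain why only two come back. A minimal sketch, assuming the library's optional skills_file argument (the CSV path is a placeholder; as far as I can tell the file is a single header row of comma-separated skill names):

from pyresparser import ResumeParser

data = ResumeParser(
    'uploads/resume.pdf',             # path the upload handler saved
    skills_file='custom_skills.csv',  # hypothetical wider skills list
).get_extracted_data()
print(data.get('skills'))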
Please rectify this, or suggest a better library or NLP model for it.
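For what it is worth, the direction I am considering next is a larger spaCy model, which generally improves PERSON/GPE recall (a minimal sketch; en_core_web_trf requires the spacy-transformers package, and each model must first be fetched with python -m spacy download <name>):

import spacy

try:
    # Transformer pipeline: best NER accuracy, but slower and heavier
    nlp = spacy.load("en_core_web_trf")
except OSError:
    # Large statistical model as a fallback
    nlp = spacy.load("en_core_web_lg")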