Nov-28-2023, 09:05 AM
I want to create a resume parser. I have used several tools for it, such as pyap, fitz (PyMuPDF), tempfile, and spaCy. The flow is: a user hits our API endpoint with a PDF or DOCX file, and the API should respond with the entities extracted from the resume. It does not work well for the person details, so I am using my own custom logic for the education, email, and contact details.
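For reference, this is roughly how a client is expected to call the endpoint below (a minimal sketch, assuming the Flask app runs locally on port 5000; the file name and path are placeholders):

import requests

# POST a resume to /upload as multipart form data; the field name
# 'file' must match what the endpoint reads from request.files.
with open('resume.pdf', 'rb') as fh:
    response = requests.post(
        'http://127.0.0.1:5000/upload',
        files={'file': ('resume.pdf', fh, 'application/pdf')},
    )

print(response.status_code)
print(response.json())  # expected shape: {'entities': {...}}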
# Import necessary libraries
from flask import Flask, request, jsonify
from docx import Document
import fitz  # PyMuPDF
import spacy
import traceback
import os
import tempfile
import re
import pyap  # address parsing (imported but not used below)
import requests
# import usaddress

# Initialize Flask app
app = Flask(__name__)

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to read text from a DOCX file
def read_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + '\n'
    return text

# Function to read text from a PDF file
def read_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc[page_num]
            text += page.get_text() + '\n'
    return text

# Function to read text from a TXT file
def read_text_from_txt(txt_path):
    with open(txt_path, 'r') as file:
        text = file.read()
    return text

# Function to extract entities (persons, organizations, locations, dates,
# emails, phones, and education) from text
def extract_entities(text):
    doc = nlp(text)
    entities = {
        'persons': [ent.text for ent in doc.ents if ent.label_ == 'PERSON'],
        'organizations': [ent.text for ent in doc.ents if ent.label_ == 'ORG'],
        'locations': [ent.text for ent in doc.ents if ent.label_ == 'LOC'],
        'dates': [ent.text for ent in doc.ents if ent.label_ == 'DATE'],
        'emails': extract_emails(text),
        'phones': extract_phone_numbers(text),
        'education': extract_education_nlp(text),
        # NOTE: 'Institute' and 'INTERNSHIP' are not labels that
        # en_core_web_sm produces, so these two lists stay empty unless
        # a custom-trained model or an EntityRuler supplies them.
        'Institute': [ent.text for ent in doc.ents if ent.label_ == 'Institute'],
        'INTERNSHIP': [ent.text for ent in doc.ents if ent.label_ == 'INTERNSHIP'],
        'expertise': extract_medical_expertise(text),
        # Works, but returns multiple addresses from the uploaded resume
        # text instead of a single one
        'address': extract_addresses(text)
    }
    return entities

def validate_address_fun(address):
    # Make an API call to Bing Maps to validate the address.
    # Replace 'YOUR_BING_MAPS_API_KEY' with your actual API key.
    api_key = 'YOUR_BING_MAPS_API_KEY'
    base_url = 'http://dev.virtualearth.net/REST/v1/Locations'
    params = {
        'q': address,
        'key': api_key,
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    # Check if the address was successfully validated
    if 'resourceSets' in data and data['resourceSets']:
        resources = data['resourceSets'][0]['resources']
        if resources:
            return resources[0]['address']['formattedAddress']
    return None

def extract_addresses(text):
    # Extract all lines from the text
    lines = text.split('\n')
    # Convert the text to lowercase once
    lower_text = text.lower()
    # Iterate through each line and attempt to validate addresses
    validated_addresses = []
    for line in lines:
        # Validate the address using the Bing Maps API
        validated_address1 = validate_address_fun(line)
        # Check that the address is not None before lowercasing it
        if validated_address1 is not None:
            lower_validated_address1 = validated_address1.lower()
            # Keep the address only if it occurs in the resume text
            if lower_validated_address1 in lower_text:
                print(validated_address1)
                validated_addresses.append(validated_address1)
    return validated_addresses

def extract_medical_expertise(text):
    expertise_patterns = [
        'Computer Engineer', 'Software Developer', 'Web Developer',
        'Systems Analyst', 'Data Scientist', 'Machine Learning Engineer',
        'Full Stack Developer', 'DevOps Engineer', 'Network Engineer',
        'Database Administrator', 'Cybersecurity Analyst', 'C# Developer',
        'Java Developer', 'Python Developer', 'HTML/CSS Developer',
        'JavaScript Developer', 'React.js Developer', 'Angular Developer',
        'Node.js Developer', 'SQL Developer', 'Cloud Engineer',
        'Big Data Engineer', 'Mobile App Developer', 'Frontend Developer',
        'Backend Developer', 'C', 'C++', 'Flutter', 'OOPJ', 'OOP', 'API',
        'HTML', 'CSS', 'Dart',
        # Add more patterns as needed
    ]
    doc = nlp(text)
    included_terms = set()  # Track terms already included
    expertise = []
    for ent in doc.ents:
        if ent.text in expertise_patterns and ent.text not in included_terms:
            expertise.append(ent.text)
            included_terms.add(ent.text)
    return expertise

def extract_emails(text):
    # Use a regular expression to find email addresses
    # (fixed: the original character class [A-Z|a-z] also matched '|')
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

def extract_phone_numbers(text):
    # Use a regular expression to find phone numbers
    phone_pattern = re.compile(r'''
        (?:\+\d{1,4}\s?)?    # Optional country code (e.g., +91, +1)
        (?:\(\d{1,4}\)\s?)?  # Optional area code in parentheses
        \d{10}               # Main phone number: exactly 10 digits
        (?:[.\s-]\d{1,4})?   # Optional extension separated by ., space, or -
        \b                   # Word boundary
    ''', re.VERBOSE)
    phone_numbers = re.findall(phone_pattern, text)
    return phone_numbers

# Function to extract education details from text
def extract_education_nlp(text):
    doc = nlp(text)
    education_details = []
    # Define patterns for education qualifications
    medical_education_patterns = [
        'B.SC.(CA & IT)', 'M.SC.(CA & IT)', 'B.DC', 'M.SC', 'B.SC',
        'BCOM', 'MCOM', 'BCA', 'MCA', 'HSC', 'SSC', "BACHELOR'S",
        "MASTER'S", "HIGHER EDUCATION", 'CHNA 1',
        'Computer Network Engineer', 'H.S.C', 'S.S.C',
    ]
    education_details1 = []
    # Extract matches for the education patterns
    # (fixed: re.escape so '.', '(' and ')' are matched literally instead
    # of being treated as regex metacharacters)
    for pattern in medical_education_patterns:
        matches = re.findall(re.escape(pattern), text, re.IGNORECASE)
        if matches:
            education_details1.extend(matches)

    # If the years in the resume appear in ascending order, reverse the
    # matched qualifications so they line up with the dates below
    years = [int(ent.text) for ent in doc.ents
             if ent.label_ == 'DATE' and ent.text.isdigit()]
    if years == sorted(years):
        education_details1.reverse()

    i = 0  # Index into the matched qualifications
    # Iterate over entities in the document
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            duration = ent.text
            education_info = {
                'duration': duration,
                'institution': '',
                'degree': '',
                'specialization': '',
                'school': '',
                'result': ''
            }
            # Iterate over tokens in the same sentence
            for token in ent.sent:
                if token.ent_type_ == 'ORG':
                    education_info['institution'] = token.text
                elif token.ent_type_ == 'GPE' and not education_info['institution']:
                    education_info['institution'] = token.text
                elif token.ent_type_ == 'NORP':
                    education_info['school'] = token.text
                elif token.ent_type_ == 'PRODUCT':
                    # Extract specialization (fixed: moved inside the token
                    # loop; originally it ran after the loop and only ever
                    # saw the last token of the sentence)
                    education_info['specialization'] = token.text
            # Assign the next matched qualification, if any (fixed: the
            # original guard "head is NOUN or head is not NOUN" was a
            # tautology, so it has been dropped)
            if education_details1:
                education_info['degree'] = education_details1[i % len(education_details1)]
                i += 1
            # Extract result or percentage
            for child in ent.root.children:
                if child.ent_type_ == 'CARDINAL' and '%' in child.text:
                    education_info['result'] = child.text
            education_details.append(education_info)
    return education_details

# Function to read a document based on its type (docx, pdf, txt)
def read_document(file, file_type):
    if file_type == 'docx':
        return read_text_from_docx(file)
    elif file_type == 'pdf':
        # Save the uploaded PDF to a temporary file
        temp_file, temp_file_path = tempfile.mkstemp(suffix='.pdf')
        with os.fdopen(temp_file, 'wb') as temp:
            temp.write(file.read())
        text = read_text_from_pdf(temp_file_path)
        os.remove(temp_file_path)  # Remove the temporary file
        return text
    elif file_type == 'txt':
        # fixed: the upload is a stream, not a path, so read it directly
        return file.read().decode('utf-8', errors='ignore')
    else:
        return "Unsupported file type"

# Endpoint for file upload
@app.route('/upload', methods=['POST'])
def upload_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    uploaded_file = request.files['file']
    if uploaded_file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    file_type = uploaded_file.filename.rsplit('.', 1)[1].lower()
    if file_type not in ['docx', 'pdf', 'txt']:
        return jsonify({'error': 'Unsupported file type'}), 400
    try:
        text = read_document(uploaded_file, file_type)
        entities = extract_entities(text)
        return jsonify({'entities': entities}), 200
    except Exception as e:
        print("Error:", str(e))
        traceback.print_exc()  # Print the full traceback
        return jsonify({'error': str(e)}), 500

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=True)

Using this code I do not get proper output for the location, person, and experience entities; the other fields come from my custom logic.
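Two things I suspect about the weak person/location output, with a minimal sketch of what I would try (assuming spaCy 3.x and en_core_web_sm; the ruler patterns are hypothetical examples):

import spacy

nlp = spacy.load("en_core_web_sm")

# 1) The English models tag cities, states, and countries as GPE, not LOC,
#    so filtering on 'LOC' alone usually returns an empty list.
def extract_locations(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ('GPE', 'LOC')]

# 2) An EntityRuler inserted before the statistical NER component can
#    force labels the model misses (e.g. the custom 'INTERNSHIP' label
#    above, which no stock model ever produces on its own).
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([
    {"label": "INTERNSHIP", "pattern": [{"LOWER": "internship"}]},
])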
After that, I decided to try the ready-made library pyresparser, like this:
from flask import Flask, render_template, request, jsonify
from pyresparser import ResumeParser
import spacy
import os

app = Flask(__name__)

# Load spaCy model
custom_nlp = spacy.load('en_core_web_sm')

@app.route('/')
def index():
    return render_template('upload.html')

@app.route('/upload', methods=['POST'])
def upload_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    uploaded_file = request.files['file']
    if uploaded_file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Save the uploaded file to a specific location on the server
    upload_folder = 'uploads'
    os.makedirs(upload_folder, exist_ok=True)  # Create the folder if it doesn't exist
    file_path = os.path.join(upload_folder, uploaded_file.filename)
    uploaded_file.save(file_path)
    # Parse the resume using pyresparser
    data = ResumeParser(file_path).get_extracted_data()
    # Process the parsed data with spaCy or other libraries as needed
    processed_data = process_data_with_spacy(data)
    print('---------------------------------------------------')
    print(processed_data)
    return jsonify({'processed_data': processed_data}), 200

def process_data_with_spacy(data):
    # print(data)
    all_entities = []
    # Process 'name' field
    if 'name' in data and data['name']:
        name_entities = extract_entities_from_text(data['name'])
        all_entities.extend(name_entities)
    # Process 'email' field
    if 'email' in data and data['email']:
        email_entities = extract_entities_from_text(data['email'])
        all_entities.extend(email_entities)
    # Process 'skills' field
    if 'skills' in data and data['skills']:
        skills_text = ' '.join(data['skills'])
        skills_entities = extract_entities_from_text(skills_text)
        print(skills_text)
        all_entities.extend(skills_entities)
    # Add more fields as needed
    return all_entities

def extract_entities_from_text(text):
    doc = custom_nlp(text)
    entities = [{'text': ent.text, 'label': ent.label_} for ent in doc.ents]
    return entities

if __name__ == '__main__':
    app.run(debug=True)

But this does not return all the data from the resume: it gives me only the name and the skills, and within skills only two items, while I have many more in my CV.
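One likely cause is that pyresparser only matches skills against the skills.csv bundled with the library, which would explain why only two come back. A minimal sketch, assuming the library's optional skills_file argument (the CSV path is a placeholder; as far as I can tell the file is a single header row of comma-separated skill names):

from pyresparser import ResumeParser

data = ResumeParser(
    'uploads/resume.pdf',             # path the upload handler saved
    skills_file='custom_skills.csv',  # hypothetical wider skills list
).get_extracted_data()
print(data.get('skills'))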
Please rectify this, or suggest a better library or NLP model for it.
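For what it is worth, the direction I am considering next is a larger spaCy model, which generally improves PERSON/GPE recall (a minimal sketch; en_core_web_trf requires the spacy-transformers package, and each model must first be fetched with python -m spacy download <name>):

import spacy

try:
    # Transformer pipeline: best NER accuracy, but slower and heavier
    nlp = spacy.load("en_core_web_trf")
except OSError:
    # Large statistical model as a fallback
    nlp = spacy.load("en_core_web_lg")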