From 68e1ced161b12526179e0912488e6330ebbb5b93 Mon Sep 17 00:00:00 2001 From: hoangnv170752 Date: Sun, 26 Jan 2025 19:14:27 +0700 Subject: [PATCH] add: dockerfile --- Dockerfile | 13 ++++ app/routes.py | 189 ++++++++++++++++++++++++----------------------- requirements.txt | 1 + run.py | 4 +- 4 files changed, 112 insertions(+), 95 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6c45dd7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 5002 + +CMD ["python", "run.py"] \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index 50c2b34..f75cf96 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,4 +1,5 @@ from flask import Blueprint, request, jsonify, current_app +from flask_restx import Api, Resource, fields from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import pandas as pd @@ -7,107 +8,109 @@ from pptx import Presentation # To handle PowerPoint files from app.chatbot import ask_groq +# Initialize Flask Blueprint and API main = Blueprint('main', __name__) +api = Api(main, version="1.0", title="File Processor API", description="API for uploading and querying files with AI support") # Dictionary to store uploaded content in memory uploaded_file_content = {} vectorizer = TfidfVectorizer(stop_words='english') -@main.route('/upload', methods=['POST']) -def upload_file(): - """ - Endpoint to upload a file (PDF, PPT, or TXT) and extract its content. - """ - if 'file' not in request.files: - return jsonify({'error': 'No file part in the request'}), 400 - - file = request.files['file'] - if file.filename == '': - return jsonify({'error': 'No file selected'}), 400 - - # Check file extension - allowed_extensions = ['.pdf', '.pptx', '.txt'] - file_ext = os.path.splitext(file.filename)[1].lower() - - if file and file_ext in allowed_extensions: - upload_folder = current_app.config['UPLOAD_FOLDER'] - os.makedirs(upload_folder, exist_ok=True) - filepath = os.path.join(upload_folder, file.filename) - file.save(filepath) +# Define request and response models for Swagger +upload_model = api.model("UploadModel", { + "file": fields.String(required=True, description="The file to upload (PDF, PPTX, or TXT)") +}) + +chat_model = api.model("ChatModel", { + "question": fields.String(required=True, description="The question to ask"), + "filename": fields.String(required=True, description="The filename of the uploaded document to query") +}) + + +@api.route('/upload') +class UploadFile(Resource): + @api.doc(description="Endpoint to upload a file (PDF, PPTX, or TXT) and extract its content.") + @api.expect(upload_model, validate=False) + def post(self): + if 'file' not in request.files: + return jsonify({'error': 'No file part in the request'}), 400 + + file = request.files['file'] + if file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + allowed_extensions = ['.pdf', '.pptx', '.txt'] + file_ext = os.path.splitext(file.filename)[1].lower() + + if file and file_ext in allowed_extensions: + upload_folder = current_app.config['UPLOAD_FOLDER'] + os.makedirs(upload_folder, exist_ok=True) + filepath = os.path.join(upload_folder, file.filename) + file.save(filepath) + + try: + if file_ext == '.pdf': + with pdfplumber.open(filepath) as pdf: + text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) + elif file_ext == '.pptx': + ppt = Presentation(filepath) + text = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) + elif file_ext == '.txt': + with open(filepath, 'r', encoding='utf-8') as txt_file: + text = txt_file.read() + + uploaded_file_content[file.filename] = text + return jsonify({'message': 'File uploaded successfully', 'filename': file.filename, 'content': text}), 200 + + except Exception as e: + return jsonify({'error': f"Failed to process the file: {str(e)}"}), 500 + + return jsonify({'error': f'Invalid file type, only {allowed_extensions} are allowed'}), 400 + + +@api.route('/chat') +class Chat(Resource): + @api.doc(description="Endpoint to handle chatbot queries based on uploaded files.") + @api.expect(chat_model, validate=True) + def post(self): + data = request.json + question = data.get('question') + filename = data.get('filename') + + if not question or not filename: + return jsonify({"error": "Both 'question' and 'filename' are required"}), 400 + + file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) + if not os.path.exists(file_path): + return jsonify({"error": f"File '{filename}' not found"}), 404 try: - if file_ext == '.pdf': - # Extract text from PDF - with pdfplumber.open(filepath) as pdf: - text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) - elif file_ext == '.pptx': - # Extract text from PPT - ppt = Presentation(filepath) - text = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) - elif file_ext == '.txt': - # Extract text from TXT - with open(filepath, 'r', encoding='utf-8') as txt_file: - text = txt_file.read() - - # Store the content in memory for future use - uploaded_file_content[file.filename] = text - - return jsonify({'message': 'File uploaded successfully', 'filename': file.filename, 'content':text}), 200 - - except Exception as e: - return jsonify({'error': f"Failed to process the file: {str(e)}"}), 500 - - return jsonify({'error': f'Invalid file type, only {allowed_extensions} are allowed'}), 400 - - -@main.route('/chat', methods=['POST']) -def chat(): - """ - Endpoint to handle chatbot queries. - Expects JSON input with 'question' and 'filename' keys. - """ - data = request.json - question = data.get('question') - filename = data.get('filename') - - if not question or not filename: - return jsonify({"error": "Both 'question' and 'filename' are required"}), 400 - - # Retrieve the content of the uploaded file from the file itself - file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) - if not os.path.exists(file_path): - return jsonify({"error": f"File '{filename}' not found"}), 404 - - try: - # Check if content is already in memory - if filename in uploaded_file_content: - content = uploaded_file_content[filename] - else: - # Reload content from the file if not in memory - file_ext = os.path.splitext(filename)[1].lower() - if file_ext == '.pdf': - with pdfplumber.open(file_path) as pdf: - content = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) - elif file_ext == '.pptx': - ppt = Presentation(file_path) - content = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) - elif file_ext == '.txt': - with open(file_path, 'r', encoding='utf-8') as txt_file: - content = txt_file.read() + if filename in uploaded_file_content: + content = uploaded_file_content[filename] else: - return jsonify({'error': 'Unsupported file type'}), 400 + file_ext = os.path.splitext(filename)[1].lower() + if file_ext == '.pdf': + with pdfplumber.open(file_path) as pdf: + content = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) + elif file_ext == '.pptx': + ppt = Presentation(file_path) + content = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) + elif file_ext == '.txt': + with open(file_path, 'r', encoding='utf-8') as txt_file: + content = txt_file.read() + else: + return jsonify({'error': 'Unsupported file type'}), 400 + + groq_response = ask_groq(question, content) + + if "error" in groq_response: + return jsonify({"error": groq_response["error"]}), 500 + + return jsonify({ + "answer": groq_response.get("answer"), + "confidence": groq_response.get("confidence") + }) - # Use the chatbot function to get an answer - groq_response = ask_groq(question, content) - - if "error" in groq_response: - return jsonify({"error": groq_response["error"]}), 500 - - return jsonify({ - "answer": groq_response.get("answer"), - "confidence": groq_response.get("confidence") - }) - - except Exception as e: - return jsonify({'error': f"Failed to process the file: {str(e)}"}), 500 \ No newline at end of file + except Exception as e: + return jsonify({'error': f"Failed to process the file: {str(e)}"}), 500 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f6c9f8c..f1f387f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ python-pptx numpy pandas scikit-learn +flask-restx \ No newline at end of file diff --git a/run.py b/run.py index a3fdaf3..bfaba44 100644 --- a/run.py +++ b/run.py @@ -2,5 +2,5 @@ app = create_app() -if __name__ == '__main__': - app.run(debug=True) +if __name__ == "__main__": + app.run(debug=True, host='0.0.0.0', port=5002)