diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6c45dd7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 5002 + +CMD ["python", "run.py"] \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index c6ee25a..e60bb1e 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,4 +1,5 @@ from flask import Blueprint, request, jsonify, current_app +from flask_restx import Api, Resource, fields from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import pandas as pd @@ -7,19 +8,49 @@ from pptx import Presentation # To handle PowerPoint files from app.chatbot import ask_groq +# Initialize Flask Blueprint and API main = Blueprint('main', __name__) +api = Api(main, version="1.0", title="File Processor API", description="API for uploading and querying files with AI support") # Dictionary to store uploaded content in memory uploaded_file_content = {} vectorizer = TfidfVectorizer(stop_words='english') +# Define request and response models for Swagger +upload_model = api.model("UploadModel", { + "file": fields.String(required=True, description="The file to upload (PDF, PPTX, or TXT)") +}) + +chat_model = api.model("ChatModel", { + "question": fields.String(required=True, description="The question to ask"), + "filename": fields.String(required=True, description="The filename of the uploaded document to query") +}) + +@api.route('/chat') +class Chat(Resource): + @api.doc(description="Endpoint to handle chatbot queries based on uploaded files.") + @api.expect(chat_model, validate=True) + def post(self): + data = request.json + question = data.get('question') + filename = data.get('filename') + + if not question or not filename: + return jsonify({"error": "Both 'question' and 'filename' are required"}), 400 + + file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) + if not os.path.exists(file_path): + return jsonify({"error": f"File '{filename}' not found"}), 404 + @main.route('/', methods=['GET']) def health_check(): return 'Server is running!' @main.route('/upload', methods=['POST']) def upload_file(): + @api.doc(description="Endpoint to upload a file (PDF, PPTX, or TXT) and extract its content.") + @api.expect(upload_model, validate=False) """ Endpoint to upload a file (PDF, PPT, or TXT) and extract its content. """ @@ -41,67 +72,34 @@ def upload_file(): file.save(filepath) try: - if file_ext == '.pdf': - # Extract text from PDF - with pdfplumber.open(filepath) as pdf: - text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) - elif file_ext == '.pptx': - # Extract text from PPT - ppt = Presentation(filepath) - text = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) - elif file_ext == '.txt': - # Extract text from TXT - with open(filepath, 'r', encoding='utf-8') as txt_file: - text = txt_file.read() - - # Store the content in memory for future use - uploaded_file_content[file.filename] = text - - return jsonify({'message': 'File uploaded successfully', 'filename': file.filename, 'content':text}), 200 + if filename in uploaded_file_content: + content = uploaded_file_content[filename] + else: + file_ext = os.path.splitext(filename)[1].lower() + if file_ext == '.pdf': + with pdfplumber.open(file_path) as pdf: + content = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) + elif file_ext == '.pptx': + ppt = Presentation(file_path) + content = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) + elif file_ext == '.txt': + with open(file_path, 'r', encoding='utf-8') as txt_file: + content = txt_file.read() + else: + return jsonify({'error': 'Unsupported file type'}), 400 + + groq_response = ask_groq(question, content) + + if "error" in groq_response: + return jsonify({"error": groq_response["error"]}), 500 + + return jsonify({ + "answer": groq_response.get("answer"), + "confidence": groq_response.get("confidence") + }) except Exception as e: return jsonify({'error': f"Failed to process the file: {str(e)}"}), 500 - - return jsonify({'error': f'Invalid file type, only {allowed_extensions} are allowed'}), 400 - - -@main.route('/chat', methods=['POST']) -def chat(): - """ - Endpoint to handle chatbot queries. - Expects JSON input with 'question' and 'filename' keys. - """ - data = request.json - question = data.get('question') - filename = data.get('filename') - - if not question or not filename: - return jsonify({"error": "Both 'question' and 'filename' are required"}), 400 - - # Retrieve the content of the uploaded file from the file itself - file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) - if not os.path.exists(file_path): - return jsonify({"error": f"File '{filename}' not found"}), 404 - - try: - # Check if content is already in memory - if filename in uploaded_file_content: - content = uploaded_file_content[filename] - else: - # Reload content from the file if not in memory - file_ext = os.path.splitext(filename)[1].lower() - if file_ext == '.pdf': - with pdfplumber.open(file_path) as pdf: - content = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()]) - elif file_ext == '.pptx': - ppt = Presentation(file_path) - content = "\n".join([paragraph.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text_frame") for paragraph in shape.text_frame.paragraphs]) - elif file_ext == '.txt': - with open(file_path, 'r', encoding='utf-8') as txt_file: - content = txt_file.read() - else: - return jsonify({'error': 'Unsupported file type'}), 400 - # Use the chatbot function to get an answer groq_response = ask_groq(question, content) @@ -151,4 +149,4 @@ def recommend_based_on_bandwidth(): filtered_documents.append(doc) except Exception as e: - return jsonify({'error': str(e)}), 500 \ No newline at end of file + return jsonify({'error': str(e)}), 500 diff --git a/requirements.txt b/requirements.txt index 239485a..e23d847 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ python-pptx numpy pandas scikit-learn +flask-restx gunicorn diff --git a/run.py b/run.py index a3fdaf3..bfaba44 100644 --- a/run.py +++ b/run.py @@ -2,5 +2,5 @@ app = create_app() -if __name__ == '__main__': - app.run(debug=True) +if __name__ == "__main__": + app.run(debug=True, host='0.0.0.0', port=5002)