{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import random, os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = [random.randint(1,99) for x in range(150)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start.sort()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(start)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "end = [start[x]+random.randint(10,51) for x in range(150)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(end)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chr = [1 for x in range(150)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(chr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"test.bed\",\"w\") as f: \n", " for x in range(150): \n", " f.write(\"%s\\t%s\\t%s\\n\" % (chr[x], start[x], end[x]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import glob" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "glob.glob(\"*\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "############# Exercice 1 ###########################" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f = open(\"test.bed\",\"r\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bed = list()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for l in f: \n", " bed.append(l.rstrip('\\n').split('\\t'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_table(f,d):\n", " out = []\n", " with open(f,\"r\") as ff:\n", " for l in ff:\n", " out.append(l.rstrip('\\n').split(d))\n", " return out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(load_table(\"test.bed\",'\\t'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "print(bed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bed_csv = list()\n", "with open(\"test.bed\", \"r\") as ff:\n", " reader = csv.reader(ff,delimiter='\\t')\n", " for line in reader:\n", " bed_csv.append(line)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(bed_csv)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for line in bed:\n", " line[0] = 'chr'+line[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(bed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#[i for i in bed if int(i[2])-int(i[1])>50]\n", "for i in bed:\n", " if int(i[2])-int(i[1])>50:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def longer(f,l):\n", " out = []\n", " for i in f:\n", " if int(i[2])-int(i[1])>l:\n", " out.append(i)\n", " return out" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(longer(bed,50))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "############# Exercice 2 ###########################" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wig = dict()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for line in bed:\n", " for bp in range(int(line[1]),int(line[2])+1):\n", " if bp in wig:\n", " wig[bp] = wig[bp]+1\n", " else:\n", " wig[bp] = 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(wig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"test.wig\",\"w\") as f:\n", " f.write(\"variableStep chrom=%s\\n\" % bed[0][0])\n", " for i,x in wig.items():\n", " f.write(\"%s\\t%s\\n\" % (i,x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "############# Exercice 3 ###########################" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sam = load_table(\"test.sam\",\"\\t\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(sam[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mapped = []\n", "for l in sam:\n", " if int(l[1]) & 4 == 0:\n", " mapped.append(l)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(sam)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(mapped)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "len([l for l in sam if int(l[1]) & 2 != 0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bitflag(table,flags):\n", " out = []\n", " for l in table:\n", " ok = True\n", " for c in flags:\n", " if int(l[1]) & c == 0:\n", " ok = False\n", " break\n", " if ok:\n", " out.append(l)\n", " return out" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(bitflag(sam,[]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "############# Homework ###########################" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fd = open(\"test.fastq\",\"r\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "i = 0\n", "fastq = []\n", "line = []\n", "\n", "# read the input file and store it as a list of 4 columns (0: header, 1: sequence, 2: '+', 3: qual. sequence)\n", "for l in fd:\n", " line.append(l.rstrip('\\n\\r'))\n", " if i%4 == 3:\n", " fastq.append(line)\n", " line=[]\n", " i = i+1\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(fastq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def cut_sequence(sq, min_mean = 30, ws = 10):\n", " # sq : quality sequence\n", " # ws = 10 : window size\n", " # min_mean = 30 # mean quality threshold\n", " w = 0\n", " while w <= len(sq)-ws:\n", " # read part of qual. sequence (window sequence)\n", " window = sq[w:w+ws] \n", " \n", " # get mean quality of window sequence\n", " for c in window: # compute per-base quality and summarise\n", " sum = sum + ord(c)-33\n", " mean = sum/ws\n", " \n", " # if mean quality is not enough, break the loop and return end position of window\n", " if(mean >= min_mean): \n", " #print(w, window)\n", " #print(mean)\n", " w = w+1\n", " else:\n", " w = w\n", " break\n", " return w+ws" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "out = []\n", "for l in fastq:\n", " window_size = 10 # define window size\n", " min_qual = 33 # define min mean quality\n", " \n", " # get end position of read satisfying quality check\n", " end = cut_sequence(l[3], min_qual, window_size)\n", " \n", " # check if quality is not sufficient in whole read\n", " if end > window_size:\n", " out.append(l)\n", " out[-1][1] = out[-1][1][:end]\n", " out[-1][3] = out[-1][3][:end]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"test_trimmed.fastq\",\"w\") as f:\n", " for l in out:\n", " f.write(l[0]+'\\n')\n", " f.write(l[1]+'\\n')\n", " f.write(l[2]+'\\n')\n", " f.write(l[3]+'\\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }